def experiment(config):
    iterations = config.pop("train-iterations")
    train_agent = ppo.PPO(config=config, env="CartPole-v0")
    checkpoint = None
    train_results = {}

    # Train
    for i in range(iterations):
        train_results = train_agent.train()
        if i % 2 == 0 or i == iterations - 1:
            checkpoint = train_agent.save(tune.get_trial_dir())
        tune.report(**train_results)
    train_agent.stop()

    # Manual Eval
    config["num_workers"] = 0
    eval_agent = ppo.PPO(config=config, env="CartPole-v0")
    eval_agent.restore(checkpoint)
    env = eval_agent.workers.local_worker().env

    obs = env.reset()
    done = False
    eval_results = {"eval_reward": 0, "eval_eps_length": 0}
    while not done:
        action = eval_agent.compute_single_action(obs)
        next_obs, reward, done, info = env.step(action)
        eval_results["eval_reward"] += reward
        eval_results["eval_eps_length"] += 1
        # Continue from the new observation.
        obs = next_obs

    results = {**train_results, **eval_results}
    tune.report(**results)

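# --- Optional driver (a sketch, not part of the original experiment() snippet). ---
# experiment() above is a plain function trainable, so it can be handed to
# tune.run(). The custom "train-iterations" key is required because experiment()
# pops it; the import path below assumes a Ray 2.x layout (older releases use
# ray.rllib.agents.ppo instead of ray.rllib.algorithms.ppo).
if __name__ == "__main__":
    import ray
    from ray import tune
    from ray.rllib.algorithms import ppo

    ray.init()
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0
    config["train-iterations"] = 2
    tune.run(
        experiment,
        config=config,
        # Reserve the resources a PPO trial itself would request.
        resources_per_trial=ppo.PPO.default_resource_request(config),
    )
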
def test_ppo_legacy_config(self):
    """Tests whether the old PPO config dict is still functional."""
    ppo_config = ppo.DEFAULT_CONFIG

    # Expect warning.
    print(f"Accessing learning-rate from legacy config dict: {ppo_config['lr']}")

    # Build Algorithm.
    ppo_trainer = ppo.PPO(config=ppo_config, env="CartPole-v1")
    print(ppo_trainer.train())

def test_dont_import_torch_error():
    """Check that an error is thrown when torch isn't installed
    but we try to run a torch experiment.
    """
    # Do not import torch for testing purposes.
    os.environ["RLLIB_TEST_NO_TORCH_IMPORT"] = "1"
    config = {"framework": "torch"}
    with pytest.raises(ImportError, match="However, there was no installation found."):
        ppo.PPO(config, env="CartPole-v1")

def test_curiosity_on_frozen_lake(self):
    config = ppo.DEFAULT_CONFIG.copy()
    # A very large frozen-lake that's hard for a random policy to solve
    # due to 0.0 feedback.
    config["env"] = "FrozenLake-v1"
    config["env_config"] = {
        "desc": [
            "SFFFFFFF",
            "FFFFFFFF",
            "FFFFFFFF",
            "FFFFFFFF",
            "FFFFFFFF",
            "FFFFFFFF",
            "FFFFFFFF",
            "FFFFFFFG",
        ],
        "is_slippery": False,
    }
    # Print out observations to see how far we already get inside the Env.
    config["callbacks"] = MyCallBack
    # Limit horizon to make it really hard for non-curious agent to reach
    # the goal state.
    config["horizon"] = 16
    # Local only.
    config["num_workers"] = 0
    config["lr"] = 0.001

    num_iterations = 10
    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        # W/ Curiosity. Expect to learn something.
        config["exploration_config"] = {
            "type": "Curiosity",
            "eta": 0.2,
            "lr": 0.001,
            "feature_dim": 128,
            "feature_net_config": {
                "fcnet_hiddens": [],
                "fcnet_activation": "relu",
            },
            "sub_exploration": {
                "type": "StochasticSampling",
            },
        }
        algo = ppo.PPO(config=config)
        learnt = False
        for i in range(num_iterations):
            result = algo.train()
            print(result)
            if result["episode_reward_max"] > 0.0:
                print("Reached goal after {} iters!".format(i))
                learnt = True
                break
        algo.stop()
        self.assertTrue(learnt)

def test_ppo_free_log_std(self):
    """Tests that the free log std option works."""
    config = (
        ppo.PPOConfig()
        .rollouts(
            num_rollout_workers=0,
        )
        .training(
            gamma=0.99,
            model=dict(
                fcnet_hiddens=[10],
                fcnet_activation="linear",
                free_log_std=True,
                vf_share_layers=True,
            ),
        )
    )

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPO(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Check the free log std var is created.
        if fw == "torch":
            matching = [
                v for (n, v) in policy.model.named_parameters() if "log_std" in n
            ]
        else:
            matching = [
                v for v in policy.model.trainable_variables() if "log_std" in str(v)
            ]
        assert len(matching) == 1, matching
        log_std_var = matching[0]

        def get_value():
            if fw == "tf":
                return policy.get_session().run(log_std_var)[0]
            elif fw == "torch":
                return log_std_var.detach().cpu().numpy()[0]
            else:
                return log_std_var.numpy()[0]

        # Check the variable is initially zero.
        init_std = get_value()
        assert init_std == 0.0, init_std
        batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy())
        if fw == "torch":
            batch = policy._lazy_tensor_dict(batch)
        policy.learn_on_batch(batch)

        # Check the variable is updated.
        post_std = get_value()
        assert post_std != 0.0, post_std
        trainer.stop()

def test_dont_import_tf_error():
    """Check that an error is thrown when tf isn't installed
    but we try to run a tf experiment.
    """
    # Do not import tf for testing purposes.
    os.environ["RLLIB_TEST_NO_TF_IMPORT"] = "1"

    config = {}
    for _ in framework_iterator(config, frameworks=("tf", "tf2", "tfe")):
        with pytest.raises(
            ImportError, match="However, there was no installation found."
        ):
            ppo.PPO(config, env="CartPole-v1")

def test_traj_view_lstm_prev_actions_and_rewards(self):
    """Tests whether Policy/Model return correct LSTM ViewRequirements."""
    config = ppo.DEFAULT_CONFIG.copy()
    config["model"] = config["model"].copy()
    # Activate LSTM + prev-action + rewards.
    config["model"]["use_lstm"] = True
    config["model"]["lstm_use_prev_action"] = True
    config["model"]["lstm_use_prev_reward"] = True

    for _ in framework_iterator(config):
        trainer = ppo.PPO(config, env="CartPole-v0")
        policy = trainer.get_policy()
        view_req_model = policy.model.view_requirements
        view_req_policy = policy.view_requirements
        # 7=obs, prev-a + r, 2x state-in, 2x state-out.
        assert len(view_req_model) == 7, view_req_model
        assert len(view_req_policy) == 20, (len(view_req_policy), view_req_policy)
        for key in [
            SampleBatch.OBS,
            SampleBatch.ACTIONS,
            SampleBatch.REWARDS,
            SampleBatch.DONES,
            SampleBatch.NEXT_OBS,
            SampleBatch.VF_PREDS,
            SampleBatch.PREV_ACTIONS,
            SampleBatch.PREV_REWARDS,
            "advantages",
            "value_targets",
            SampleBatch.ACTION_DIST_INPUTS,
            SampleBatch.ACTION_LOGP,
        ]:
            assert key in view_req_policy

            if key == SampleBatch.PREV_ACTIONS:
                assert view_req_policy[key].data_col == SampleBatch.ACTIONS
                assert view_req_policy[key].shift == -1
            elif key == SampleBatch.PREV_REWARDS:
                assert view_req_policy[key].data_col == SampleBatch.REWARDS
                assert view_req_policy[key].shift == -1
            elif key not in [
                SampleBatch.NEXT_OBS,
                SampleBatch.PREV_ACTIONS,
                SampleBatch.PREV_REWARDS,
            ]:
                assert view_req_policy[key].data_col is None
            else:
                assert view_req_policy[key].data_col == SampleBatch.OBS
                assert view_req_policy[key].shift == 1
        trainer.stop()

def test_leaky_env(self):
    """Tests whether our diagnostics tools can detect leaks in an env."""
    config = ppo.DEFAULT_CONFIG.copy()
    # Make sure we have an env to test on the local worker.
    # Otherwise, `check_memory_leaks` will complain.
    config["create_env_on_driver"] = True
    config["env"] = MemoryLeakingEnv
    config["env_config"] = {
        "static_samples": True,
    }
    trainer = ppo.PPO(config=config)
    results = check_memory_leaks(trainer, to_check={"env"}, repeats=150)
    assert results["env"]
    trainer.stop()

def test_ppo_exploration_setup(self):
    """Tests whether PPO runs with different exploration setups."""
    config = (
        ppo.PPOConfig()
        .environment(
            env_config={"is_slippery": False, "map_name": "4x4"},
        )
        .rollouts(
            # Run locally.
            num_rollout_workers=0,
        )
    )
    obs = np.array(0)

    # Test against all frameworks.
    for fw in framework_iterator(config):
        # Default Agent should be set up with StochasticSampling.
        trainer = ppo.PPO(config=config, env="FrozenLake-v1")
        # explore=False, always expect the same (deterministic) action.
        a_ = trainer.compute_single_action(
            obs, explore=False, prev_action=np.array(2), prev_reward=np.array(1.0)
        )
        # Test whether this is really the argmax action over the logits.
        if fw != "tf":
            last_out = trainer.get_policy().model.last_output()
            if fw == "torch":
                check(a_, np.argmax(last_out.detach().cpu().numpy(), 1)[0])
            else:
                check(a_, np.argmax(last_out.numpy(), 1)[0])
        for _ in range(50):
            a = trainer.compute_single_action(
                obs,
                explore=False,
                prev_action=np.array(2),
                prev_reward=np.array(1.0),
            )
            check(a, a_)

        # With explore=True (default), expect stochastic actions.
        actions = []
        for _ in range(300):
            actions.append(
                trainer.compute_single_action(
                    obs, prev_action=np.array(2), prev_reward=np.array(1.0)
                )
            )
        check(np.mean(actions), 1.5, atol=0.2)
        trainer.stop()

def test_modelv3(self):
    config = {
        "env": "CartPole-v0",
        "model": {
            "custom_model": RNNModel,
            "custom_model_config": {
                "hiddens_size": 64,
                "cell_size": 128,
            },
        },
        "num_workers": 0,
    }
    trainer = ppo.PPO(config=config)
    for _ in range(2):
        results = trainer.train()
        print(results)

def test_preprocessing_disabled(self):
    config = ppo.DEFAULT_CONFIG.copy()
    config["seed"] = 42
    config["env"] = "ray.rllib.examples.env.random_env.RandomEnv"
    config["env_config"] = {
        "config": {
            "observation_space": Dict({
                "a": Discrete(5),
                "b": Dict({
                    "ba": Discrete(4),
                    "bb": Box(-1.0, 1.0, (2, 3), dtype=np.float32),
                }),
                "c": Tuple((MultiDiscrete([2, 3]), Discrete(1))),
                "d": Box(-1.0, 1.0, (1,), dtype=np.int32),
            }),
        },
    }
    # Set this to True to enforce no preprocessors being used.
    # Complex observations now arrive directly in the model as
    # structures of batches, e.g. {"a": tensor, "b": [tensor, tensor]}
    # for obs-space=Dict(a=..., b=Tuple(..., ...)).
    config["_disable_preprocessor_api"] = True
    # Speed things up a little.
    config["train_batch_size"] = 100
    config["sgd_minibatch_size"] = 10
    config["rollout_fragment_length"] = 5
    config["num_sgd_iter"] = 1

    num_iterations = 1
    # Only supported for tf so far.
    for _ in framework_iterator(config):
        algo = ppo.PPO(config=config)
        for i in range(num_iterations):
            results = algo.train()
            check_train_results(results)
            print(results)
        check_compute_single_action(algo)
        algo.stop()

def test_counting_by_agent_steps(self):
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)

    num_agents = 3

    config["num_workers"] = 2
    config["num_sgd_iter"] = 2
    config["framework"] = "torch"
    config["rollout_fragment_length"] = 21
    config["train_batch_size"] = 147
    config["multiagent"] = {
        "policies": {f"p{i}" for i in range(num_agents)},
        "policy_mapping_fn": lambda aid, **kwargs: "p{}".format(aid),
        "count_steps_by": "agent_steps",
    }
    # Env setup.
    config["env"] = MultiAgentPendulum
    config["env_config"] = {"num_agents": num_agents}

    num_iterations = 2
    trainer = ppo.PPO(config=config)
    results = None
    for i in range(num_iterations):
        results = trainer.train()
    self.assertEqual(results["agent_timesteps_total"], results["timesteps_total"])
    self.assertEqual(
        results["num_env_steps_trained"] * num_agents,
        results["num_agent_steps_trained"],
    )
    self.assertGreaterEqual(
        results["agent_timesteps_total"],
        num_iterations * config["train_batch_size"],
    )
    self.assertLessEqual(
        results["agent_timesteps_total"],
        (num_iterations + 1) * config["train_batch_size"],
    )
    trainer.stop()

def test_traj_view_attention_net(self):
    config = ppo.DEFAULT_CONFIG.copy()
    # Setup attention net.
    config["model"] = config["model"].copy()
    config["model"]["max_seq_len"] = 50
    config["model"]["custom_model"] = GTrXLNet
    config["model"]["custom_model_config"] = {
        "num_transformer_units": 1,
        "attention_dim": 64,
        "num_heads": 2,
        "memory_inference": 50,
        "memory_training": 50,
        "head_dim": 32,
        "ff_hidden_dim": 32,
    }
    # Test with odd batch numbers.
    config["train_batch_size"] = 1031
    config["sgd_minibatch_size"] = 201
    config["num_sgd_iter"] = 5
    config["num_workers"] = 0
    config["callbacks"] = MyCallbacks
    config["env_config"] = {"config": {"start_at_t": 1}}  # first obs is [1.0]

    for _ in framework_iterator(config, frameworks="tf2"):
        trainer = ppo.PPO(
            config,
            env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv",
        )
        rw = trainer.workers.local_worker()
        sample = rw.sample()
        assert sample.count == trainer.config["rollout_fragment_length"]
        results = trainer.train()
        assert results["timesteps_total"] == config["train_batch_size"]
        trainer.stop()

}

# use stop conditions passed via CLI (or defaults)
stop = {
    "training_iteration": args.stop_iters,
    "timesteps_total": args.stop_timesteps,
    "episode_reward_mean": args.stop_reward,
}

# manual training loop using PPO without tune.run()
if args.no_tune:
    if args.run != "PPO":
        raise ValueError("Only support --run PPO with --no-tune.")
    ppo_config = ppo.DEFAULT_CONFIG.copy()
    ppo_config.update(config)
    algo = ppo.PPO(config=ppo_config, env=CorrelatedActionsEnv)
    # run manual training loop and print results after each iteration
    for _ in range(args.stop_iters):
        result = algo.train()
        print(pretty_print(result))
        # stop training if the target train steps or reward are reached
        if (
            result["timesteps_total"] >= args.stop_timesteps
            or result["episode_reward_mean"] >= args.stop_reward
        ):
            break

    # run manual test loop: 1 iteration until done
    print("Finished training. Running manual test/inference loop.")
    env = CorrelatedActionsEnv(_)
    obs = env.reset()
    done = False
    total_reward = 0

        self.action_space = gym.spaces.Discrete(2)  # 0: left, 1: right
        self.observation_space = gym.spaces.Discrete(self.end_pos)

    def reset(self):
        self.cur_pos = 0
        return self.cur_pos

    def step(self, action):
        if action == 0 and self.cur_pos > 0:  # move left (towards start)
            self.cur_pos -= 1
        elif action == 1:  # move right (towards goal)
            self.cur_pos += 1
        if self.cur_pos >= self.end_pos:
            return 0, 1.0, True, {}
        else:
            return self.cur_pos, -0.1, False, {}


ray.init()
config = {
    "env": SimpleCorridor,
    "env_config": {
        "corridor_length": 5,
    },
}

algo = ppo.PPO(config=config)
for _ in range(3):
    print(algo.train())
# __rllib-custom-gym-env-end__

config["framework"] = "tf" outdir = "export_tf" if os.path.exists(outdir): shutil.rmtree(outdir) np.random.seed(1234) # We will run inference with this test batch test_data = { "obs": np.random.uniform(0, 1.0, size=(10, 4)).astype(np.float32), } # Start Ray and initialize a PPO Algorithm. ray.init() algo = ppo.PPO(config=config, env="CartPole-v0") # You could train the model here # algo.train() # Let's run inference on the tensorflow model policy = algo.get_policy() result_tf, _ = policy.model(test_data) # Evaluate tensor to fetch numpy array with policy._sess.as_default(): result_tf = result_tf.eval() # This line will export the model to ONNX res = algo.export_policy_model(outdir, onnx=11)
    def value_function(self):
        return torch.from_numpy(np.zeros(shape=(self._last_batch_size,)))


if __name__ == "__main__":
    ray.init()

    # Register the above custom model.
    ModelCatalog.register_custom_model("my_torch_model", MyCustomModel)

    # Create the Trainer.
    trainer = ppo.PPO(
        env="CartPole-v0",
        config={
            "framework": "torch",
            "model": {
                # Auto-wrap the custom(!) model with an LSTM.
                "use_lstm": True,
                # To further customize the LSTM auto-wrapper.
                "lstm_cell_size": 64,
                # Specify our custom model from above.
                "custom_model": "my_torch_model",
                # Extra kwargs to be passed to your model's c'tor.
                "custom_model_config": {},
            },
        },
    )
    trainer.train()
# __sphinx_doc_end__

        self.action_space = gym.spaces.Discrete(2)  # 0: left, 1: right
        self.observation_space = gym.spaces.Discrete(self.end_pos)

    def reset(self):
        self.cur_pos = 0
        return self.cur_pos

    def step(self, action):
        if action == 0 and self.cur_pos > 0:  # move left (towards start)
            self.cur_pos -= 1
        elif action == 1:  # move right (towards goal)
            self.cur_pos += 1
        if self.cur_pos >= self.end_pos:
            return 0, 1.0, True, {}
        else:
            return self.cur_pos, -0.1, False, {}


ray.init()
config = {
    "env": SimpleCorridor,
    "env_config": {
        "corridor_length": 5,
    },
}

trainer = ppo.PPO(config=config)
for _ in range(3):
    print(trainer.train())
# __rllib-custom-gym-env-end__

"eager_tracing": args.eager_tracing, } stop = { "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, "episode_reward_mean": args.stop_reward, } # manual training loop (no Ray tune) if args.no_tune: if args.run not in {"APPO", "PPO"}: raise ValueError("This example only supports APPO and PPO.") ppo_config = ppo.DEFAULT_CONFIG.copy() ppo_config.update(config) trainer = ppo.PPO(config=ppo_config, env=ActionMaskEnv) # run manual training loop and print results after each iteration for _ in range(args.stop_iters): result = trainer.train() print(pretty_print(result)) # stop training if the target train steps or reward are reached if (result["timesteps_total"] >= args.stop_timesteps or result["episode_reward_mean"] >= args.stop_reward): break # manual test loop print("Finished training. Running manual test/inference loop.") # prepare environment with max 10 steps config["env_config"]["max_episode_len"] = 10 env = ActionMaskEnv(config["env_config"]) obs = env.reset()
stop = {
    "training_iteration": args.stop_iters,
    "timesteps_total": args.stop_timesteps,
    "episode_reward_mean": args.stop_reward,
}

if args.no_tune:
    # manual training loop using PPO and a fixed learning rate
    if args.run != "PPO":
        raise ValueError("Only support --run PPO with --no-tune.")
    print("Running manual train loop without Ray Tune.")
    ppo_config = ppo.DEFAULT_CONFIG.copy()
    ppo_config.update(config)
    # use fixed learning rate instead of grid search (needs tune)
    ppo_config["lr"] = 1e-3
    trainer = ppo.PPO(config=ppo_config, env=SimpleCorridor)
    # run manual training loop and print results after each iteration
    for _ in range(args.stop_iters):
        result = trainer.train()
        print(pretty_print(result))
        # stop training if the target train steps or reward are reached
        if (
            result["timesteps_total"] >= args.stop_timesteps
            or result["episode_reward_mean"] >= args.stop_reward
        ):
            break
else:
    # automated run with Tune and grid search and TensorBoard
    print("Training automatically with Ray Tune")
    results = tune.run(args.run, config=config, stop=stop)

def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = (
        ppo.PPOConfig()
        .rollouts(
            num_rollout_workers=0,
        )
        .training(
            gamma=0.99,
            model=dict(
                fcnet_hiddens=[10],
                fcnet_activation="linear",
                vf_share_layers=True,
            ),
        )
    )

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPO(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Check no free log std var by default.
        if fw == "torch":
            matching = [
                v for (n, v) in policy.model.named_parameters() if "log_std" in n
            ]
        else:
            matching = [
                v for v in policy.model.trainable_variables() if "log_std" in str(v)
            ]
        assert len(matching) == 0, matching

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        train_batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy())
        if fw == "torch":
            train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS], [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss.
        if fw in ["tf2", "tfe"]:
            PPOTF2Policy.loss(policy, policy.model, Categorical, train_batch)
        elif fw == "torch":
            PPOTorchPolicy.loss(
                policy, policy.model, policy.dist_class, train_batch
            )

        vars = (
            policy.model.variables()
            if fw != "torch"
            else list(policy.model.parameters())
        )
        if fw == "tf":
            vars = policy.get_session().run(vars)
        expected_shared_out = fc(
            train_batch[SampleBatch.CUR_OBS],
            vars[0 if fw != "torch" else 2],
            vars[1 if fw != "torch" else 3],
            framework=fw,
        )
        expected_logits = fc(
            expected_shared_out,
            vars[2 if fw != "torch" else 0],
            vars[3 if fw != "torch" else 1],
            framework=fw,
        )
        expected_value_outs = fc(
            expected_shared_out, vars[4], vars[5], framework=fw
        )

        kl, entropy, pg_loss, vf_loss, overall_loss = self._ppo_loss_helper(
            policy,
            policy.model,
            Categorical if fw != "torch" else TorchCategorical,
            train_batch,
            expected_logits,
            expected_value_outs,
            sess=sess,
        )
        if sess:
            policy_sess = policy.get_session()
            k, e, pl, v, tl = policy_sess.run(
                [
                    policy._mean_kl_loss,
                    policy._mean_entropy,
                    policy._mean_policy_loss,
                    policy._mean_vf_loss,
                    policy._total_loss,
                ],
                feed_dict=policy._get_loss_inputs_dict(train_batch, shuffle=False),
            )
            check(k, kl)
            check(e, entropy)
            check(pl, np.mean(-pg_loss))
            check(v, np.mean(vf_loss), decimals=4)
            check(tl, overall_loss, decimals=4)
        elif fw == "torch":
            check(policy.model.tower_stats["mean_kl_loss"], kl)
            check(policy.model.tower_stats["mean_entropy"], entropy)
            check(policy.model.tower_stats["mean_policy_loss"], np.mean(-pg_loss))
            check(
                policy.model.tower_stats["mean_vf_loss"],
                np.mean(vf_loss),
                decimals=4,
            )
            check(
                policy.model.tower_stats["total_loss"], overall_loss, decimals=4
            )
        else:
            check(policy._mean_kl_loss, kl)
            check(policy._mean_entropy, entropy)
            check(policy._mean_policy_loss, np.mean(-pg_loss))
            check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(policy._total_loss, overall_loss, decimals=4)
        trainer.stop()

outdir = "export_torch" if os.path.exists(outdir): shutil.rmtree(outdir) np.random.seed(1234) # We will run inference with this test batch test_data = { "obs": np.random.uniform(0, 1.0, size=(10, 4)).astype(np.float32), "state_ins": np.array([0.0], dtype=np.float32), } # Start Ray and initialize a PPO trainer ray.init() trainer = ppo.PPO(config=config, env="CartPole-v0") # You could train the model here # trainer.train() # Let's run inference on the torch model policy = trainer.get_policy() result_pytorch, _ = policy.model( { "obs": torch.tensor(test_data["obs"]), } ) # Evaluate tensor to fetch numpy array result_pytorch = result_pytorch.detach().numpy()
"PPO", stop={"timesteps_total": train_steps}, config={ "env": env_name, "lr": learning_rate }, checkpoint_at_end=True, local_dir=save_dir, ) # retrieve the checkpoint path analysis.default_metric = "episode_reward_mean" analysis.default_mode = "max" checkpoint_path = analysis.get_best_checkpoint(trial=analysis.get_best_trial()) print(f"Trained model saved at {checkpoint_path}") # load and restore model agent = ppo.PPO(env=env_name) agent.restore(checkpoint_path) print(f"Agent loaded from saved model at {checkpoint_path}") # inference env = gym.make(env_name) obs = env.reset() for i in range(1000): action = agent.compute_single_action(obs) obs, reward, done, info = env.step(action) env.render() if done: print(f"Cart pole dropped after {i} steps.") break
}

stop = {
    "training_iteration": args.stop_iters,
    "timesteps_total": args.stop_timesteps,
    "episode_reward_mean": args.stop_reward,
}

# Manual training loop (no Ray tune).
if args.no_tune:
    # manual training loop using PPO and manually keeping track of state
    if args.run != "PPO":
        raise ValueError("Only support --run PPO with --no-tune.")
    ppo_config = ppo.DEFAULT_CONFIG.copy()
    ppo_config.update(config)
    algo = ppo.PPO(config=ppo_config, env=args.env)
    # run manual training loop and print results after each iteration
    for _ in range(args.stop_iters):
        result = algo.train()
        print(pretty_print(result))
        # stop training if the target train steps or reward are reached
        if (
            result["timesteps_total"] >= args.stop_timesteps
            or result["episode_reward_mean"] >= args.stop_reward
        ):
            break

    # Run manual test loop (only for RepeatAfterMe env).
    if args.env == "RepeatAfterMeEnv":
        print("Finished training. Running manual test/inference loop.")
        # prepare env
        env = RepeatAfterMeEnv(config["env_config"])
        obs = env.reset()