def test_preprocessing_disabled(self):
    config = ppo.DEFAULT_CONFIG.copy()
    config["seed"] = 42
    config["env"] = "ray.rllib.examples.env.random_env.RandomEnv"
    config["env_config"] = {
        "config": {
            "observation_space": Dict({
                "a": Discrete(5),
                "b": Dict({
                    "ba": Discrete(4),
                    "bb": Box(-1.0, 1.0, (2, 3), dtype=np.float32),
                }),
                "c": Tuple((MultiDiscrete([2, 3]), Discrete(1))),
                "d": Box(-1.0, 1.0, (1, ), dtype=np.int32),
            }),
        },
    }
    # Set this to True to enforce no preprocessors being used.
    # Complex observations now arrive directly in the model as
    # structures of batches, e.g. {"a": tensor, "b": [tensor, tensor]}
    # for obs-space=Dict(a=..., b=Tuple(..., ...)).
    config["_disable_preprocessor_api"] = True
    # Speed things up a little.
    config["train_batch_size"] = 100
    config["sgd_minibatch_size"] = 10
    config["rollout_fragment_length"] = 5
    config["num_sgd_iter"] = 1
    num_iterations = 1

    # Only supported for tf so far.
    for _ in framework_iterator(config):
        trainer = ppo.PPOTrainer(config=config)
        for i in range(num_iterations):
            results = trainer.train()
            check_train_results(results)
            print(results)
        check_compute_single_action(trainer)
        trainer.stop()
def test_traj_view_lstm_prev_actions_and_rewards(self):
    """Tests whether Policy/Model return correct LSTM ViewRequirements."""
    config = ppo.DEFAULT_CONFIG.copy()
    config["model"] = config["model"].copy()
    # Activate LSTM + prev-action + rewards.
    config["model"]["use_lstm"] = True
    config["model"]["lstm_use_prev_action"] = True
    config["model"]["lstm_use_prev_reward"] = True

    for _ in framework_iterator(config):
        trainer = ppo.PPOTrainer(config, env="CartPole-v0")
        policy = trainer.get_policy()
        view_req_model = policy.model.view_requirements
        view_req_policy = policy.view_requirements
        # 7=obs, prev-a + r, 2x state-in, 2x state-out.
        assert len(view_req_model) == 7, view_req_model
        assert len(view_req_policy) == 20, \
            (len(view_req_policy), view_req_policy)
        for key in [
                SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
                SampleBatch.DONES, SampleBatch.NEXT_OBS,
                SampleBatch.VF_PREDS, SampleBatch.PREV_ACTIONS,
                SampleBatch.PREV_REWARDS, "advantages", "value_targets",
                SampleBatch.ACTION_DIST_INPUTS, SampleBatch.ACTION_LOGP
        ]:
            assert key in view_req_policy
            if key == SampleBatch.PREV_ACTIONS:
                assert view_req_policy[key].data_col == SampleBatch.ACTIONS
                assert view_req_policy[key].shift == -1
            elif key == SampleBatch.PREV_REWARDS:
                assert view_req_policy[key].data_col == SampleBatch.REWARDS
                assert view_req_policy[key].shift == -1
            elif key not in [
                    SampleBatch.NEXT_OBS, SampleBatch.PREV_ACTIONS,
                    SampleBatch.PREV_REWARDS
            ]:
                assert view_req_policy[key].data_col is None
            else:
                assert view_req_policy[key].data_col == SampleBatch.OBS
                assert view_req_policy[key].shift == 1
        trainer.stop()
def get_rl_agent(agent_name, config, env_to_agent):
    if agent_name == A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        raise ValueError("Invalid agent name: {}".format(agent_name))
    return agent
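# A minimal usage sketch for get_rl_agent(). This is an assumption-laden
# example, not part of the original code: it presumes A2C is one of the
# module's agent-name constants and that a registered env id (here
# "CartPole-v0") is acceptable as env_to_agent.
import ray

ray.init()
config = {"num_workers": 1, "framework": "tf"}
agent = get_rl_agent(A2C, config, "CartPole-v0")
for _ in range(3):
    print(agent.train()["episode_reward_mean"])
agent.stop()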
def test(self, algo, path, lr, fc_hid, fc_act):
    """Test a trained agent for a single episode. Return the episode reward."""
    # Instantiate env class.
    unused_shared = []
    unused_own = []
    unsatisfied_shared = []
    unsatisfied_own = []
    episode_reward = 0

    self.config_test["num_workers"] = 0
    self.config_test["lr"] = lr
    self.config_test['model']["fcnet_hiddens"] = fc_hid
    self.config_test['model']["fcnet_activation"] = fc_act

    if algo == "ppo":
        self.agent = ppo.PPOTrainer(config=self.config_test)
    elif algo == "ddpg":
        self.agent = ddpg.DDPGTrainer(config=self.config_test)
    elif algo == "a3c":
        self.agent = a3c.A3CTrainer(config=self.config_test)
    elif algo == "td3":
        self.agent = ddpg.TD3Trainer(config=self.config_test)
    elif algo == "appo":
        self.agent = ppo.APPOTrainer(config=self.config_test)
    self.agent.restore(path)

    env = self.agent.workers.local_worker().env
    obs = env.reset()
    done = False
    while not done:
        action = self.agent.compute_action(obs)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        unused_shared.append(info["unused_shared"])
        unused_own.append(info["unused_own"])
        unsatisfied_shared.append(info["unsatisfied_shared"])
        unsatisfied_own.append(info["unsatisfied_own"])
    return (episode_reward, unused_shared, unused_own, unsatisfied_shared,
            unsatisfied_own)
def get_rllib_agent(agent_name, env_name, env, env_to_agent):
    config = get_config(env_name, env, 1) if is_rllib_agent(agent_name) else {}
    if agent_name == RLLIB_A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        # Avoid returning an unbound `agent` on an unknown name.
        raise ValueError("Invalid agent name: {}".format(agent_name))
    return agent
def test_ppo_exploration_setup(self):
    """Tests whether PPO runs with different exploration setups."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    config["num_workers"] = 0  # Run locally.
    config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
    obs = np.array(0)

    # Test against all frameworks.
    for fw in framework_iterator(config):
        # Default Agent should be setup with StochasticSampling.
        trainer = ppo.PPOTrainer(config=config, env="FrozenLake-v0")
        # explore=False, always expect the same (deterministic) action.
        a_ = trainer.compute_action(
            obs,
            explore=False,
            prev_action=np.array(2),
            prev_reward=np.array(1.0))
        # Test whether this is really the argmax action over the logits.
        if fw != "tf":
            last_out = trainer.get_policy().model.last_output()
            if fw == "torch":
                check(a_, np.argmax(last_out.detach().cpu().numpy(), 1)[0])
            else:
                check(a_, np.argmax(last_out.numpy(), 1)[0])
        for _ in range(50):
            a = trainer.compute_action(
                obs,
                explore=False,
                prev_action=np.array(2),
                prev_reward=np.array(1.0))
            check(a, a_)

        # With explore=True (default), expect stochastic actions.
        actions = []
        for _ in range(300):
            actions.append(
                trainer.compute_action(
                    obs, prev_action=np.array(2), prev_reward=np.array(1.0)))
        check(np.mean(actions), 1.5, atol=0.2)
        trainer.stop()
def test_counting_by_agent_steps(self):
    """Tests whether `count_steps_by=agent_steps` counts steps correctly."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)

    num_agents = 3

    config["num_workers"] = 2
    config["num_sgd_iter"] = 2
    config["framework"] = "torch"
    config["rollout_fragment_length"] = 21
    config["train_batch_size"] = 147
    config["multiagent"] = {
        "policies": {f"p{i}" for i in range(num_agents)},
        "policy_mapping_fn": lambda aid, **kwargs: "p{}".format(aid),
        "count_steps_by": "agent_steps",
    }
    # Env setup.
    config["env"] = MultiAgentPendulum
    config["env_config"] = {"num_agents": num_agents}

    num_iterations = 2
    trainer = ppo.PPOTrainer(config=config)
    results = None
    for i in range(num_iterations):
        results = trainer.train()
    self.assertEqual(results["agent_timesteps_total"],
                     results["timesteps_total"])
    self.assertEqual(
        results["num_env_steps_trained"] * num_agents,
        results["num_agent_steps_trained"],
    )
    self.assertGreaterEqual(
        results["agent_timesteps_total"],
        num_iterations * config["train_batch_size"],
    )
    self.assertLessEqual(
        results["agent_timesteps_total"],
        (num_iterations + 1) * config["train_batch_size"],
    )
    trainer.stop()
def load_trained_agent(new_checkpoint):
    # Previous trainer.
    prev_trainer = ppo.PPOTrainer(
        env=DummyTrainer,
        config={
            "env_config": {},
            "framework": "torch",
            "num_gpus": 0,
            "num_workers": 0,
            "explore": False
        })
    # Restore an older model for the previous trainer.
    prev_checkpoint_index = new_checkpoint
    try:
        prev_trainer.restore(
            f"models/checkpoint_{prev_checkpoint_index}/checkpoint-{prev_checkpoint_index}"
        )
    except FileNotFoundError:
        return None
    return prev_trainer.workers.local_worker().get_policy()
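# Hypothetical usage of load_trained_agent(); the checkpoint index and the
# zero-observation below are assumptions for illustration only.
import numpy as np

policy = load_trained_agent(100)
if policy is not None:
    # Policy.compute_actions() takes a batch of observations and returns
    # (actions, rnn_states, extra_fetches).
    obs = np.zeros(policy.observation_space.shape, dtype=np.float32)
    actions, _, _ = policy.compute_actions([obs])
    print("greedy action:", actions[0])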
def setup(self, config):
    path1 = config["path"]
    path_invariant = config["path_invariant"]
    batch_size = config["batch_size"]
    train_data = GridSearchDataset()
    val_data = GridSearchDataset()
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size)
    invariant_model = torch.nn.Sequential(
        torch.nn.Linear(2, 50), torch.nn.ReLU(), torch.nn.Linear(50, 1),
        torch.nn.Tanh())
    # Load the invariant model.
    invariant_model.load_state_dict(
        torch.load(path_invariant, map_location=torch.device('cpu')))
    invariant_model.cuda()
    # Use a separate name for the PPO config so it does not shadow this
    # method's `config` argument (read again below for the lr).
    ppo_config = get_PPO_config(1234)
    trainer = ppo.PPOTrainer(config=ppo_config)
    trainer.restore(path1)
    policy = trainer.get_policy()
    # Load the agent model.
    sequential_nn = convert_ray_policy_to_sequential(policy)
    sequential_nn.cuda()
    model = sequential_nn
    optimizer = torch.optim.Adam(model.parameters(), lr=config.get("lr", 1e-3))
    loss = RetrainLoss(invariant_model)  # torch.nn.MSELoss()
    self.models, self.optimizer, self.criterion = self.register(
        models=[model, invariant_model], optimizers=optimizer, criterion=loss)
    self.model = self.models[0]
    self.register_data(train_loader=train_loader, validation_loader=val_loader)
def get_PPO_trainer(use_gpu=1):
    ModelCatalog.register_custom_model("my_model", TorchCustomModel)
    config = {
        "env": allCars,
        # "model": {"custom_model": "my_model", "fcnet_hiddens": [64, 64],
        #           "fcnet_activation": "relu"},  # model config
        "vf_share_layers": False,
        "lr": 5e-4,
        "num_gpus": use_gpu,
        "vf_clip_param": 100000,
        "grad_clip": 2500,
        "num_workers": 8,  # parallelism
        "batch_mode": "complete_episodes",
        "evaluation_interval": 10,
        "use_gae": True,
        # "lambda": 0.95,  # GAE lambda param
        "num_envs_per_worker": 10,
        "train_batch_size": 4000,
        "evaluation_num_episodes": 20,
        "rollout_fragment_length": 1000,
        "framework": "torch",
        "horizon": 100
    }
    trainer = ppo.PPOTrainer(config=config)
    return config, trainer
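# Hypothetical usage of get_PPO_trainer(); the iteration count and checkpoint
# handling are assumptions, not part of the original code.
import ray

ray.init()
config, trainer = get_PPO_trainer(use_gpu=0)
for i in range(10):
    result = trainer.train()
    print(i, result["episode_reward_mean"])
checkpoint = trainer.save()
print("checkpoint saved at", checkpoint)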
def test_traj_view_attention_net(self):
    config = ppo.DEFAULT_CONFIG.copy()
    # Setup attention net.
    config["model"] = config["model"].copy()
    config["model"]["max_seq_len"] = 50
    config["model"]["custom_model"] = GTrXLNet
    config["model"]["custom_model_config"] = {
        "num_transformer_units": 1,
        "attention_dim": 64,
        "num_heads": 2,
        "memory_inference": 50,
        "memory_training": 50,
        "head_dim": 32,
        "ff_hidden_dim": 32,
    }
    # Test with odd batch numbers.
    config["train_batch_size"] = 1031
    config["sgd_minibatch_size"] = 201
    config["num_sgd_iter"] = 5
    config["num_workers"] = 0
    config["callbacks"] = MyCallbacks
    config["env_config"] = {
        "config": {
            "start_at_t": 1
        }
    }  # first obs is [1.0]

    for _ in framework_iterator(config, frameworks="tf2"):
        trainer = ppo.PPOTrainer(
            config,
            env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv",
        )
        rw = trainer.workers.local_worker()
        sample = rw.sample()
        assert sample.count == config["rollout_fragment_length"]
        results = trainer.train()
        assert results["train_batch_size"] == config["train_batch_size"]
        trainer.stop()
def train():
    ray.init()
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_gpus"] = 0
    config["env_config"] = {
        "history_len": 10,
        "features": "sent latency inflation,latency ratio,send ratio"
    }
    config["num_workers"] = 6
    config["eager"] = False
    config["log_level"] = "INFO"
    config["monitor"] = True
    config["num_cpus_per_worker"] = 0
    trainer = ppo.PPOTrainer(config=config, env=SimulatedNetworkEnv)

    for i in range(1000):
        # Perform one iteration of training the policy with PPO.
        result = trainer.train()
        print(pretty_print(result))
        if i % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
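# A complementary evaluation sketch (an assumption, not part of the original
# script): it presumes the checkpoint path printed by train() above and that
# SimulatedNetworkEnv can be constructed directly from the same env_config.
def evaluate(checkpoint_path, config):
    trainer = ppo.PPOTrainer(config=config, env=SimulatedNetworkEnv)
    trainer.restore(checkpoint_path)
    env = SimulatedNetworkEnv(config["env_config"])
    obs, done, total_reward = env.reset(), False, 0.0
    while not done:
        # Deterministic (greedy) actions for evaluation.
        action = trainer.compute_action(obs, explore=False)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward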
def _continuous_run(self):
    import ray
    from ray import tune
    from ray.rllib.agents import ppo, ddpg

    ray.init(num_cpus=4, num_gpus=1, local_mode=True)
    configs = {
        'num_gpus': 1,
        'num_workers': 4,
        # 'num_gpus_per_worker': 1,
        'framework': 'torch',
        "simple_optimizer": True,
    }
    # Map algorithm names to trainer classes and instantiate only the
    # selected one (building both trainers eagerly would waste resources).
    AGENT_CLASSES = {
        'ddpg': ddpg.DDPGTrainer,
        'ppo': ppo.PPOTrainer,
    }
    trainer = AGENT_CLASSES[self.configs['algorithm']](
        config=configs, env="MountainCarContinuous-v0")
    # tune.run(agent, config={"env": "MountainCarContinuous-v0",
    #                         "framework": "torch", "num_gpus": 0})
    for i in range(2000):  # 2000 epochs
        result = trainer.train()  # 1 epoch
        print(result)
    return
def solve():
    length = 20
    ray.init()
    trainer = ppo.PPOTrainer(
        env=CorridorEnv, config={"env_config": {
            "length": length
        }})
    while True:
        results = trainer.train()
        training_iteration = results.get("training_iteration")
        episode_length_mean = results.get("episode_len_mean")
        episodes_total = results.get("episodes_total")
        total_time = results.get("time_total_s")
        print("\n============")
        print(pretty_print(results))
        print(f"\nCorridorEnv (length: {length}) "
              f"Training Iteration {training_iteration}:"
              f"\n\tIteration episode mean length: {episode_length_mean}"
              f"\n\tEpisodes total: {episodes_total}"
              f"\n\tTime total: {round(total_time, 1)}sec")
        if episode_length_mean <= length:
            break
    print(f"\nProblem solved in {episodes_total} training episodes")
def test_plain(self):
    config = ppo.DEFAULT_CONFIG.copy()
    for _ in framework_iterator(config, frameworks="torch"):
        trainer = ppo.PPOTrainer(config, env="CartPole-v0")
        policy = trainer.get_policy()
        view_req_model = policy.model.inference_view_requirements()
        view_req_policy = policy.training_view_requirements()
        assert len(view_req_model) == 1
        assert len(view_req_policy) == 6
        for key in [
                SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
                SampleBatch.DONES, SampleBatch.NEXT_OBS, SampleBatch.VF_PREDS
        ]:
            assert key in view_req_policy
            # None of the view cols has a special underlying data_col,
            # except next-obs.
            if key != SampleBatch.NEXT_OBS:
                assert view_req_policy[key].data_col is None
            else:
                assert view_req_policy[key].data_col == SampleBatch.OBS
                assert view_req_policy[key].shift == 1
        trainer.stop()
def training_PPO(start_train_date, end_train_date, resume, diff_days):
    config = ppo.DEFAULT_CONFIG.copy()
    config["observation_filter"] = 'MeanStdFilter'
    config["batch_mode"] = "complete_episodes"
    config["lr"] = 1e-4
    config["num_workers"] = num_cores
    config["env_config"] = {
        "settings": settings,
        "main_path": curr_path,
        "start_train": start_train_date,
        "end_train": end_train_date,
        "train/test": "train",
        "sc_volt_start_train": sc_volt_train,
        "diff_days": diff_days,
        "GT_hour_start": 0,
    }
    trainer = ppo.PPOTrainer(config=config, env="simplePible")

    if resume != "":
        print("Restoring checkpoint: ", resume)
        sleep(5)
        # Can optionally call trainer.restore(path) to load a checkpoint.
        trainer.restore(resume)

    for i in range(0, int(settings[0]["training_iterations"])):
        result = trainer.train()
        print(pretty_print(result))
        if int(result["training_iteration"]) % 10 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
            checkp_split = checkpoint.split('/')
            parent_dir = '/'.join(checkp_split[0:-2])
            # Remove previous agents and save the new agent into
            # Agents_Saved.
            Ember_RL_func.rm_old_save_new_agent(parent_dir, save_agent_folder)
def test_lstm_prev_actions_and_rewards(self):
    config = ppo.DEFAULT_CONFIG.copy()
    config["model"] = config["model"].copy()
    # Activate LSTM + prev-action + rewards.
    config["model"]["use_lstm"] = True
    config["model"]["lstm_use_prev_action_reward"] = True

    for _ in framework_iterator(config, frameworks="torch"):
        trainer = ppo.PPOTrainer(config, env="CartPole-v0")
        policy = trainer.get_policy()
        view_req_model = policy.model.inference_view_requirements()
        view_req_policy = policy.training_view_requirements()
        assert len(view_req_model) == 3  # obs, prev_a, prev_r
        assert len(view_req_policy) == 8
        for key in [
                SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
                SampleBatch.DONES, SampleBatch.NEXT_OBS,
                SampleBatch.VF_PREDS, SampleBatch.PREV_ACTIONS,
                SampleBatch.PREV_REWARDS
        ]:
            assert key in view_req_policy
            if key == SampleBatch.PREV_ACTIONS:
                assert view_req_policy[key].data_col == SampleBatch.ACTIONS
                assert view_req_policy[key].shift == -1
            elif key == SampleBatch.PREV_REWARDS:
                assert view_req_policy[key].data_col == SampleBatch.REWARDS
                assert view_req_policy[key].shift == -1
            elif key not in [
                    SampleBatch.NEXT_OBS, SampleBatch.PREV_ACTIONS,
                    SampleBatch.PREV_REWARDS
            ]:
                assert view_req_policy[key].data_col is None
            else:
                assert view_req_policy[key].data_col == SampleBatch.OBS
                assert view_req_policy[key].shift == 1
        trainer.stop()
def test_ppo_compilation_and_lr_schedule(self):
    """Test whether a PPOTrainer can be built with all frameworks."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    # For checking lr-schedule correctness.
    config["callbacks"] = MyCallbacks

    config["num_workers"] = 1
    config["num_sgd_iter"] = 2
    # Settings in case we use an LSTM.
    config["model"]["lstm_cell_size"] = 10
    config["model"]["max_seq_len"] = 20
    # Use default-native keras models whenever possible.
    config["model"]["_use_default_native_models"] = True

    config["train_batch_size"] = 128
    # Test with compression.
    config["compress_observations"] = True
    num_iterations = 2

    for _ in framework_iterator(config):
        for env in ["CartPole-v0", "MsPacmanNoFrameskip-v4"]:
            print("Env={}".format(env))
            for lstm in [True, False]:
                print("LSTM={}".format(lstm))
                config["model"]["use_lstm"] = lstm
                config["model"]["lstm_use_prev_action"] = lstm
                config["model"]["lstm_use_prev_reward"] = lstm

                trainer = ppo.PPOTrainer(config=config, env=env)
                for i in range(num_iterations):
                    trainer.train()
                check_compute_single_action(
                    trainer,
                    include_prev_action_reward=True,
                    include_state=lstm)
                trainer.stop()
def test_no_curiosity(self):
    # Copy the default config so the global DEFAULT_CONFIG is not mutated.
    config = ppo.DEFAULT_CONFIG.copy()
    env = "CartPole-v0"
    dummy_obs = np.array([0.0, 0.1, 0.0, 0.0])
    prev_a = np.array(0)

    config["framework"] = "torch"
    config["exploration_config"] = {"type": "ParameterNoise"}
    trainer = ppo.PPOTrainer(config=config, env=env)
    trainer.train()
    # Make sure all actions drawn are the same, given same
    # observations. Tests the exploration API.
    actions = []
    for _ in range(5):
        actions.append(
            trainer.compute_action(
                observation=dummy_obs,
                explore=False,
                prev_action=prev_a,
                prev_reward=1.0 if prev_a is not None else None))
        check(actions[-1], actions[0])
    print(actions)
def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["model"]["vf_share_layers"] = True

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Check no free log std var by default.
        if fw == "torch":
            matching = [
                v for (n, v) in policy.model.named_parameters()
                if "log_std" in n
            ]
        else:
            matching = [
                v for v in policy.model.trainable_variables()
                if "log_std" in str(v)
            ]
        assert len(matching) == 0, matching

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        train_batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy())
        if fw == "torch":
            train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss.
        if fw in ["tf2", "tfe"]:
            ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                  train_batch)
        elif fw == "torch":
            ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                     train_batch)

        vars = policy.model.variables() if fw != "torch" else \
            list(policy.model.parameters())
        if fw == "tf":
            vars = policy.get_session().run(vars)
        expected_shared_out = fc(
            train_batch[SampleBatch.CUR_OBS],
            vars[0 if fw != "torch" else 2],
            vars[1 if fw != "torch" else 3],
            framework=fw)
        expected_logits = fc(
            expected_shared_out,
            vars[2 if fw != "torch" else 0],
            vars[3 if fw != "torch" else 1],
            framework=fw)
        expected_value_outs = fc(
            expected_shared_out, vars[4], vars[5], framework=fw)

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy,
                policy.model,
                Categorical if fw != "torch" else TorchCategorical,
                train_batch,
                expected_logits,
                expected_value_outs,
                sess=sess)
        if sess:
            policy_sess = policy.get_session()
            k, e, pl, v, tl = policy_sess.run(
                [
                    policy._mean_kl,
                    policy._mean_entropy,
                    policy._mean_policy_loss,
                    policy._mean_vf_loss,
                    policy._total_loss,
                ],
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch, shuffle=False))
            check(k, kl)
            check(e, entropy)
            check(pl, np.mean(-pg_loss))
            check(v, np.mean(vf_loss), decimals=4)
            check(tl, overall_loss, decimals=4)
        else:
            check(policy._mean_kl, kl)
            check(policy._mean_entropy, entropy)
            check(policy._mean_policy_loss, np.mean(-pg_loss))
            check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(policy._total_loss, overall_loss, decimals=4)
        trainer.stop()
import logging.config
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)

helpers.register_madras()
ray.init()
config = ppo.DEFAULT_CONFIG.copy()
# Full config is here:
# https://github.com/ray-project/ray/blob/d51583dbd6dc9c082764b9ec06349678aaa71078/rllib/agents/trainer.py#L42
config["num_gpus"] = 0
config["num_workers"] = 1
config["eager"] = False
# Originally 10. We should consider scaling down the rewards to keep the
# episode reward under 2000.
config["vf_clip_param"] = 20
# config["gamma"] = 0.7
# config["lr"] = 5e-7
# config["batch_mode"] = "complete_episodes"
# config["train_batch_size"] = 10000
trainer = ppo.PPOTrainer(config=config, env="madras_env")

# Can optionally call trainer.restore(path) to load a checkpoint.
for i in range(10000):
    # Perform one iteration of training the policy with PPO.
    result = trainer.train()
    print(pretty_print(result))
    if i % 10 == 0:
        checkpoint = trainer.save()
        logging.info("checkpoint saved at %s", checkpoint)
def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["vf_share_layers"] = True

    # Fake CartPole episode of n time steps.
    train_batch = {
        SampleBatch.CUR_OBS: np.array(
            [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
             [0.9, 1.0, 1.1, 1.2]],
            dtype=np.float32),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
        SampleBatch.ACTION_DIST_INPUTS: np.array(
            [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
        SampleBatch.ACTION_LOGP: np.array(
            [-0.5, -0.1, -0.2], dtype=np.float32),
    }

    for fw in ["tf", "torch"]:
        print("framework={}".format(fw))
        config["use_pytorch"] = fw == "torch"
        config["eager"] = fw == "tf"

        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        if fw == "tf":
            train_batch = postprocess_ppo_gae_tf(policy, train_batch)
        else:
            train_batch = postprocess_ppo_gae_torch(policy, train_batch)
            train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss (results are stored in policy.loss_obj)
        # for tf.
        if fw == "tf":
            ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                  train_batch)
        else:
            ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                     train_batch)

        vars = policy.model.variables() if fw == "tf" else \
            list(policy.model.parameters())
        expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS], vars[0],
                                 vars[1])
        expected_logits = fc(expected_shared_out, vars[2], vars[3])
        expected_value_outs = fc(expected_shared_out, vars[4], vars[5])

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy,
                policy.model,
                Categorical if fw == "tf" else TorchCategorical,
                train_batch,
                expected_logits,
                expected_value_outs)
        check(policy.loss_obj.mean_kl, kl)
        check(policy.loss_obj.mean_entropy, entropy)
        check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
        check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
        check(policy.loss_obj.loss, overall_loss, decimals=4)
def _main():
    """ Training loop """
    # Args
    logger.info('Arguments: %s', str(ARGS))

    # Results
    metrics_dir, checkpoint_dir, best_checkpoint_dir, debug_dir, eval_dir = \
        results_handler(ARGS)

    # Initialize the simulation.
    # ray.init()
    # ray.init(memory=52428800, object_store_memory=78643200)  # minimum values
    # ray.init(address='auto', _redis_password='******')  # attach
    ray.init(
        num_cpus=ARGS.ray_cpus,
        num_gpus=ARGS.ray_gpus,
        _memory=ARGS.ray_mem_gb * GB,
        object_store_memory=ARGS.ray_store_gb * GB)

    # Load default Scenario configuration.
    experiment_config = load_json_file(ARGS.config)

    # Associate the agents with something.
    env_config = {
        'metrics_dir': metrics_dir,
        'checkpoint_dir': checkpoint_dir,
        'agent_init': load_json_file(experiment_config['agents_init']),
        'scenario_config': experiment_config['marl_env_config'],
    }
    # Fix the config for learning:
    env_config['agent_init']['eval'] = {}
    marl_env = None
    if ARGS.env == 'MARL':
        ray.tune.registry.register_env(
            'marl_env', complexparkingstochasticdeepmarlenv.env_creator)
        marl_env = \
            complexparkingstochasticdeepmarlenv.CSParkingPersuasiveDeepMARLEnv(
                env_config)
    else:
        raise Exception('Unknown environment %s' % ARGS.env)

    # Persuasive PPO Algorithm.
    policy_class = ppo.PPOTFPolicy
    policy_conf = persuasive_ppo_conf(
        rollout_size=ARGS.rollout_size,
        agents=len(marl_env.get_agents()),
        debug_folder=debug_dir,
        eval_folder=eval_dir,
        alpha=ARGS.alpha,
        gamma=ARGS.gamma)

    # Gen config.
    agent = marl_env.get_agents()[0]
    policies = {
        'unique': (policy_class, marl_env.get_obs_space(agent),
                   marl_env.get_action_space(agent), {})
    }
    policy_conf['multiagent']['policies'] = policies
    policy_conf['multiagent']['policy_mapping_fn'] = lambda agent_id: 'unique'
    policy_conf['env_config'] = env_config
    # policy_conf['evaluation_config']['env_config'] = {
    #     'metrics_dir': metrics_dir,
    #     'checkpoint_dir': checkpoint_dir,
    #     'agent_init': load_json_file(experiment_config['agents_init']),
    #     'scenario_config': experiment_config['marl_env_config'],
    # }
    logger.info('Configuration: \n%s', pformat(policy_conf))

    def default_logger_creator(config):
        """
        Creates a Unified logger with a default logdir prefix
        containing the agent name and the env id.
        """
        log_dir = os.path.join(os.path.normpath(ARGS.dir), 'logs')
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        return UnifiedLogger(config, log_dir, loggers=[NoopLogger])

    trainer = ppo.PPOTrainer(
        # env=deepmarlenvironment.PersuasiveDeepMARLEnv,
        env='marl_env',
        config=policy_conf,
        logger_creator=default_logger_creator)

    last_checkpoint = get_last_checkpoint(checkpoint_dir)
    if last_checkpoint is not None:
        trainer.restore(last_checkpoint)
        logger.info('Restored checkpoint: %s', last_checkpoint)
        # Restore the latest best metrics.
        for metric in CHECKPOINT_METRICS:
            CURRENT_METRICS[metric]['value'] = get_last_best_of(
                os.path.join(best_checkpoint_dir, metric))
        logger.info('Restored metrics: \n%s', pformat(CURRENT_METRICS))

    counter = 0
    unchanged_window = 0
    final_result = None
    while counter < ARGS.training_iterations:
        # Do one training step.
        result = trainer.train()
        checkpoint = trainer.save(checkpoint_dir)
        logger.info('Checkpoint saved in %s', checkpoint)
        counter = result['iterations_since_restore']
        # counter = result['training_iteration']
        # steps += result['info']['num_steps_trained']
        # steps += result['timesteps_this_iter']
        final_result = result
        print_selected_results(result, SELECTION)
        metric_file = os.path.join(
            metrics_dir,
            'metrics_{}.json'.format(result['training_iteration']))
        with open(metric_file, 'w') as fstream:
            # The evaluation metrics are not saved in 'results.json'.
            json.dump(result, fstream, cls=NPEncoder)
            # fstream.write('\n')
        print('############################# METRIC SAVED '
              '#############################')
        ########################################################################
        if 'evaluation' not in result:
            continue
        changes = False
        for metric in CHECKPOINT_METRICS:
            old = CURRENT_METRICS[metric]['value']
            new = CURRENT_METRICS[metric]['get'](result)
            # if np.isnan(new):
            #     pprint(result['evaluation'])
            #     raise Exception(metric, old, new)
            if CURRENT_METRICS[metric]['check'](new, old):
                # Save the "best" checkpoint.
                if metric in STOPPING_METRICS:
                    changes = True
                CURRENT_METRICS[metric]['value'] = new
                cleanup(os.path.join(best_checkpoint_dir, metric))
                current_checkpoint = trainer.save(
                    os.path.join(best_checkpoint_dir, metric))
                current_info_file = os.path.join(best_checkpoint_dir, metric,
                                                 'info.json')
                current_value = {'value': str(new)}
                with open(current_info_file, 'w') as fstream:
                    json.dump(current_value, fstream, indent=4)
                if old is None:
                    old = -1.0
                logger.info(
                    'UPDATING %s: %.2f (%.2f). Checkpoint saved in %s',
                    metric, new, old, current_checkpoint)
            else:
                logger.info('UNCHANGED %s ---> Best: %.2f - New: %.2f',
                            metric, old, new)
        if changes:
            unchanged_window = 0
        else:
            unchanged_window += 1
            logger.info(
                'Nothing has changed for the last %d training runs in the '
                'monitored metrics [%s].', unchanged_window,
                str(STOPPING_METRICS))
        if unchanged_window >= 10:
            break
    ########################################################################
    # pprint(final_result)
    print_selected_results(final_result, SELECTION)
env_params['res_nutilda_tol'] = 1.0e-4
env_params['reward_type'] = 1  # 1: cl/cd, 2: cd
env_params['states_type'] = 1  # 1: single state, 2: k states history
env_params['vx'] = 25.75

data = np.loadtxt(
    'control_points_range.csv', delimiter=',', skiprows=1,
    usecols=range(1, 4))
env_params['controlparams_low'] = data[:, 1]
env_params['controlparams_high'] = data[:, 2]

config["env_config"] = env_params

# Trainer
# trainer = appo.APPOTrainer(config=config, env="myenv")
trainer = ppo.PPOTrainer(config=config, env="myenv")
# trainer = a3c.A3CTrainer(config=config, env="myenv")
# trainer = a2c.A2CTrainer(config=config, env="myenv")
trainer_time = time()
# trainer.restore('./PPO_myenv_2020-07-23_18-53-59cb7gh16j/checkpoint_51/checkpoint-51')

file_results = 'Training_iterations_ppo.txt'

# Can optionally call trainer.restore(path) to load a checkpoint.
result = {'episodes_total': 0}
results = []
with open(file_results, 'wb', 0) as f:
    # for i in range(ncount):
    i = 0
    while result['episodes_total'] <= 3000:
def test_traj_view_simple_performance(self):
    """Test whether PPOTrainer runs faster w/ `_use_trajectory_view_api`."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    action_space = Discrete(2)
    obs_space = Box(-1.0, 1.0, shape=(700, ))

    from ray.rllib.examples.env.random_env import RandomMultiAgentEnv
    from ray.tune import register_env
    register_env(
        "ma_env", lambda c: RandomMultiAgentEnv({
            "num_agents": 2,
            "p_done": 0.0,
            "max_episode_len": 104,
            "action_space": action_space,
            "observation_space": obs_space
        }))

    config["num_workers"] = 3
    config["num_envs_per_worker"] = 8
    config["num_sgd_iter"] = 1  # Put less weight on training.

    policies = {
        "pol0": (None, obs_space, action_space, {}),
    }

    def policy_fn(agent_id):
        return "pol0"

    config["multiagent"] = {
        "policies": policies,
        "policy_mapping_fn": policy_fn,
    }
    num_iterations = 2

    for _ in framework_iterator(config, frameworks="torch"):
        print("w/ traj. view API")
        config["_use_trajectory_view_api"] = True
        trainer = ppo.PPOTrainer(config=config, env="ma_env")
        learn_time_w = 0.0
        sampler_perf_w = {}
        start = time.time()
        for i in range(num_iterations):
            out = trainer.train()
            ts = out["timesteps_total"]
            sampler_perf_ = out["sampler_perf"]
            sampler_perf_w = {
                k:
                sampler_perf_w.get(k, 0.0) + (sampler_perf_[k] * 1000 / ts)
                for k, v in sampler_perf_.items()
            }
            delta = out["timers"]["learn_time_ms"] / ts
            learn_time_w += delta
            print("{}={}s".format(i, delta))
        sampler_perf_w = {
            k: sampler_perf_w[k] / (num_iterations if "mean_" in k else 1)
            for k, v in sampler_perf_w.items()
        }
        duration_w = time.time() - start
        print("Duration: {}s "
              "sampler-perf.={} learn-time/iter={}s".format(
                  duration_w, sampler_perf_w,
                  learn_time_w / num_iterations))
        trainer.stop()

        print("w/o traj. view API")
        config["_use_trajectory_view_api"] = False
        trainer = ppo.PPOTrainer(config=config, env="ma_env")
        learn_time_wo = 0.0
        sampler_perf_wo = {}
        start = time.time()
        for i in range(num_iterations):
            out = trainer.train()
            ts = out["timesteps_total"]
            sampler_perf_ = out["sampler_perf"]
            sampler_perf_wo = {
                k:
                sampler_perf_wo.get(k, 0.0) + (sampler_perf_[k] * 1000 / ts)
                for k, v in sampler_perf_.items()
            }
            delta = out["timers"]["learn_time_ms"] / ts
            learn_time_wo += delta
            print("{}={}s".format(i, delta))
        sampler_perf_wo = {
            k: sampler_perf_wo[k] / (num_iterations if "mean_" in k else 1)
            for k, v in sampler_perf_wo.items()
        }
        duration_wo = time.time() - start
        print("Duration: {}s "
              "sampler-perf.={} learn-time/iter={}s".format(
                  duration_wo, sampler_perf_wo,
                  learn_time_wo / num_iterations))
        trainer.stop()

        # Assert `_use_trajectory_view_api` is faster.
        self.assertLess(sampler_perf_w["mean_raw_obs_processing_ms"],
                        sampler_perf_wo["mean_raw_obs_processing_ms"])
        self.assertLess(sampler_perf_w["mean_action_processing_ms"],
                        sampler_perf_wo["mean_action_processing_ms"])
        self.assertLess(duration_w, duration_wo)
        self.action_space = gym.spaces.Discrete(2)  # 0: back, 1: forward
        self.observation_space = gym.spaces.Discrete(self.end_pos)

    def reset(self):
        self.cur_pos = 0
        return self.cur_pos

    def step(self, action):
        if action == 0 and self.cur_pos > 0:
            # Move back (towards the start at position 0).
            self.cur_pos -= 1
        elif action == 1:
            # Move forward (towards the goal at end_pos).
            self.cur_pos += 1
        if self.cur_pos >= self.end_pos:
            return 0, 1.0, True, {}
        else:
            return self.cur_pos, -0.1, False, {}


ray.init()
config = {
    "env": SimpleCorridor,
    "env_config": {
        "corridor_length": 5,
    },
}

trainer = ppo.PPOTrainer(config=config)
for _ in range(3):
    print(trainer.train())
# __rllib-custom-gym-env-end__
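# Hypothetical reconstruction of the class header and constructor that the
# fragment above elides; a sketch based on RLlib's custom-gym-env docs
# pattern, assuming end_pos comes from the "corridor_length" key passed via
# env_config.
import gym
import ray
from ray.rllib.agents import ppo


class SimpleCorridor(gym.Env):
    def __init__(self, config):
        # env_config from the trainer config is passed in here.
        self.end_pos = config.get("corridor_length", 5)
        self.cur_pos = 0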
        steps (list): list of global steps after each episode
        returns (list): list of total return of each episode
    """
    box = np.ones(self.log_frequency) / self.log_frequency
    returns_smooth = np.convolve(self.returns[1:], box, mode='same')
    plt.clf()
    plt.plot(self.steps[1:], returns_smooth)
    plt.title('Status Report')
    plt.ylabel('Return')
    plt.xlabel('Steps')
    plt.savefig('returns.png')

    with open('returns.txt', 'w') as f:
        for step, value in zip(self.steps[1:], self.returns[1:]):
            f.write("{}\t{}\n".format(step, value))


if __name__ == '__main__':
    ray.init()
    trainer = ppo.PPOTrainer(
        env=DiamondCollector,
        config={
            'env_config': {},      # No environment parameters to configure
            'framework': 'torch',  # Use pytorch instead of tensorflow
            'num_gpus': 0,         # We aren't using GPUs
            'num_workers': 0       # We aren't using parallelism
        })
    while True:
        print(trainer.train())
"Colisions for feet: ", self.env.env.robot.calc_state()[20], " ", self.env.env.robot.calc_state()[21] ) #returns states, last 2 numbers inticate whther foot is in contact with ground self.dts_taken_so_far += 1 if self.debug: print("Time elapsed in episode: ", self.dts_taken_so_far * self.env.env.scene.dt) print("Number of dt's taken in episode: ", self.dts_taken_so_far) return self.env.step(action) policy = CustomPolicy(env.observation_space, env.action_space, {}) workers = WorkerSet(policy=CustomPolicy, env_creator=lambda c: gym.make("CartPole-v0"), num_workers=10) from ray.tune.registry import register_env register_env("walkerbulletenv", lambda config: WalkerEnv(config)) #trainer = ppo.PPOTrainer(config=config, env="walkerbulletenv") ray.init() config = ppo.DEFAULT_CONFIG.copy() config['num_workers'] = 0 ckpt_path = "/home/roman/ray_results/PPO_walkerbulletenv_2020-05-28_22-26-581q14o5cv/checkpoint_991/checkpoint-991" #path to saved policy agent = ppo.PPOTrainer(config, env="walkerbulletenv") agent.restore(ckpt_path) # restore agent (policy) from checkpoint policy = agent.workers.local_worker().get_policy() # get the policy
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from tqdm import tqdm

from aie import plotting
from aie.aie_env import AIEEnv
from rl.conf import BASE_PPO_CONF, OUT_DIR
from rl.models.tf.fcnet import FCNet

# %%
ray.init()
ModelCatalog.register_custom_model("my_model", FCNet)

# %%
trainer = ppo.PPOTrainer(config={
    **BASE_PPO_CONF,
    "num_workers": 0,
})
ckpt_path = OUT_DIR / 'PPO_AIEEnv_2021-02-19_16-19-44p6xanojq/checkpoint_1777/checkpoint-1777'
trainer.restore(str(ckpt_path))

# %%
env = AIEEnv({}, force_dense_logging=True)
obs = env.reset()
for t in tqdm(range(1000)):
    results = {
        k: trainer.compute_action(
            v,
            policy_id='learned',
"horizon": 2000, "num_gpus": 1, "explore": False #"replay_sequence_length": 5, #"num_workers": 4, #"num_envs_per_worker": 2, } ray.init(local_mode=True) checkpoint_number = 790 env = Herding({"sheep_count": 3 #"agents_layout": "simple" }) agent = ppo.PPOTrainer(config=config, env=HerdingEnvWrapper) agent.restore( rf"C:\Users\Mateusz\ray_results\Herding\Herding\checkpoint_{checkpoint_number}\checkpoint-{checkpoint_number}" ) while True: episode_reward = 0 done = False steps = 0 obs = env.reset() while (not done) and (steps != 300): action = agent.compute_action(obs[0], policy_id="policy") obs, reward, done, info = env.step(np.array([[2, action]])) env.render() episode_reward += reward steps += 1