def load_policy(env, algo, env_name, policy_path=None, coop=False, seed=0, extra_configs={}):
    if algo == 'ppo':
        agent = ppo.PPOTrainer(
            setup_config(env, algo, coop, seed, extra_configs),
            'assistive_gym:' + env_name)
    elif algo == 'sac':
        agent = sac.SACTrainer(
            setup_config(env, algo, coop, seed, extra_configs),
            'assistive_gym:' + env_name)
    if policy_path != '':
        if 'checkpoint' in policy_path:
            agent.restore(policy_path)
        else:
            # Find the most recent policy in the directory
            directory = os.path.join(policy_path, algo, env_name)
            files = [
                int(f.split('_')[-1])
                for f in glob.glob(os.path.join(directory, 'checkpoint_*'))
            ]
            if files:
                checkpoint_num = max(files)
                checkpoint_path = os.path.join(
                    directory, 'checkpoint_%d' % checkpoint_num,
                    'checkpoint-%d' % checkpoint_num)
                agent.restore(checkpoint_path)
                # return agent, checkpoint_path
            return agent, None
    return agent, None
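# A minimal usage sketch for load_policy(). The env id and the checkpoint
# directory below are assumptions for illustration, not taken from the
# snippet above; setup_config(), ppo, sac, os, glob and gym are assumed to
# be imported by the surrounding module.
env = gym.make('assistive_gym:ScratchItchJaco-v0')
agent, _ = load_policy(env, 'sac', 'ScratchItchJaco-v0',
                       policy_path='./trained_models', coop=False, seed=0)
obs = env.reset()
action = agent.compute_action(obs)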
def test_sac_compilation(self):
    """Tests whether an SACTrainer can be built with all frameworks."""
    config = sac.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["twin_q"] = True
    config["soft_horizon"] = True
    config["clip_actions"] = False
    config["normalize_actions"] = True
    config["learning_starts"] = 0
    config["prioritized_replay"] = True
    num_iterations = 1
    for _ in framework_iterator(config):
        # Test for different env types (discrete w/ and w/o image, + cont).
        for env in [
                "Pendulum-v0", "MsPacmanNoFrameskip-v4", "CartPole-v0"
        ]:
            print("Env={}".format(env))
            config["use_state_preprocessor"] = \
                env == "MsPacmanNoFrameskip-v4"
            trainer = sac.SACTrainer(config=config, env=env)
            for i in range(num_iterations):
                results = trainer.train()
                print(results)
            check_compute_single_action(trainer)
            trainer.stop()
def test_sac_fake_multi_gpu_learning(self):
    """Test whether SACTrainer can learn CartPole w/ faked multi-GPU."""
    config = copy.deepcopy(sac.DEFAULT_CONFIG)
    # Fake GPU setup.
    config["num_gpus"] = 2
    config["_fake_gpus"] = True
    config["clip_actions"] = False
    config["initial_alpha"] = 0.001
    config["prioritized_replay"] = True
    env = "ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv"
    config["env_config"] = {"config": {"repeat_delay": 0}}

    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = sac.SACTrainer(config=config, env=env)
        num_iterations = 50
        learnt = False
        for i in range(num_iterations):
            results = trainer.train()
            print(f"R={results['episode_reward_mean']}")
            if results["episode_reward_mean"] > 30.0:
                learnt = True
                break
        assert learnt, \
            f"SAC multi-GPU (with fake-GPUs) did not learn {env}!"
        trainer.stop()
def get_PPO_trainer(use_gpu=1):
    ModelCatalog.register_custom_model("my_model", TorchCustomModel)
    config = {
        "env": StoppingCar,
        # "model": {"custom_model": "my_model", "fcnet_hiddens": [64, 64],
        #           "fcnet_activation": "relu"},  # model config
        "lr": 5e-4,
        "num_gpus": use_gpu,
        # "vf_share_layers": False,
        # "vf_clip_param": 100000,
        "grad_clip": 2500,
        "num_workers": 8,  # parallelism
        "batch_mode": "complete_episodes",
        "evaluation_interval": 10,
        # "use_gae": True,
        # "lambda": 0.95,  # gae lambda param
        "num_envs_per_worker": 10,
        "train_batch_size": 4000,
        "evaluation_num_episodes": 20,
        "rollout_fragment_length": 1000,
        "framework": "torch",
        "horizon": 1000
    }
    # trainer = ppo.PPOTrainer(config=config)
    trainer = sac.SACTrainer(config=config)
    return config, trainer
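# Note: despite its name, get_PPO_trainer() currently returns an SACTrainer
# (the PPO line above is commented out). A minimal usage sketch, assuming
# ray, ModelCatalog, TorchCustomModel and StoppingCar are importable in the
# surrounding project; the GPU count and iteration count are illustrative.
ray.init()
config, trainer = get_PPO_trainer(use_gpu=0)
for _ in range(5):
    print(trainer.train()["episode_reward_mean"])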
def test_sac_compilation(self):
    """Tests whether an SACTrainer can be built with all frameworks."""
    config = sac.DEFAULT_CONFIG.copy()
    config["Q_model"] = sac.DEFAULT_CONFIG["Q_model"].copy()
    config["num_workers"] = 0  # Run locally.
    config["twin_q"] = True
    config["clip_actions"] = False
    config["normalize_actions"] = True
    config["learning_starts"] = 0
    config["prioritized_replay"] = True
    config["rollout_fragment_length"] = 10
    config["train_batch_size"] = 10
    # If we use default buffer size (1e6), the buffer will take up
    # 169.445 GB memory, which is beyond travis-ci's current (Mar 19, 2021)
    # available system memory (8.34816 GB).
    config["buffer_size"] = 40000
    num_iterations = 1

    ModelCatalog.register_custom_model("batch_norm", KerasBatchNormModel)
    ModelCatalog.register_custom_model("batch_norm_torch",
                                       TorchBatchNormModel)

    image_space = Box(-1.0, 1.0, shape=(84, 84, 3))
    simple_space = Box(-1.0, 1.0, shape=(3, ))

    for fw in framework_iterator(config):
        # Test for different env types (discrete w/ and w/o image, + cont).
        for env in [
                RandomEnv,
                "MsPacmanNoFrameskip-v4",
                "CartPole-v0",
        ]:
            print("Env={}".format(env))
            if env == RandomEnv:
                config["env_config"] = {
                    "observation_space": Tuple(
                        [simple_space, Discrete(2), image_space]),
                    "action_space": Box(-1.0, 1.0, shape=(1, )),
                }
            else:
                config["env_config"] = {}
            # Test making the Q-model a custom one for CartPole, otherwise,
            # use the default model.
            config["Q_model"]["custom_model"] = (
                "batch_norm{}".format("_torch" if fw == "torch" else "")
                if env == "CartPole-v0" else None)
            trainer = sac.SACTrainer(config=config, env=env)
            for i in range(num_iterations):
                results = trainer.train()
                print(results)
            check_compute_single_action(trainer)
            trainer.stop()
def test_sac_dict_obs_order(self):
    dict_space = Dict({
        "img": Box(low=0, high=1, shape=(42, 42, 3)),
        "cont": Box(low=0, high=100, shape=(3, )),
    })

    # Dict space .sample() returns an ordered dict.
    # Make sure the keys in samples are ordered differently.
    dict_samples = [{
        k: v
        for k, v in reversed(dict_space.sample().items())
    } for _ in range(10)]

    class NestedDictEnv(Env):
        def __init__(self):
            self.action_space = Box(low=-1.0, high=1.0, shape=(2, ))
            self.observation_space = dict_space
            self._spec = EnvSpec("NestedDictEnv-v0")
            self.steps = 0

        def reset(self):
            self.steps = 0
            return dict_samples[0]

        def step(self, action):
            self.steps += 1
            return dict_samples[self.steps], 1, self.steps >= 5, {}

    tune.register_env("nested", lambda _: NestedDictEnv())

    config = sac.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["learning_starts"] = 0
    config["rollout_fragment_length"] = 5
    config["train_batch_size"] = 5
    config["replay_buffer_config"]["capacity"] = 10
    # Disable preprocessors.
    config["_disable_preprocessor_api"] = True
    num_iterations = 1

    for _ in framework_iterator(config, with_eager_tracing=True):
        trainer = sac.SACTrainer(env="nested", config=config)
        for _ in range(num_iterations):
            results = trainer.train()
            check_train_results(results)
            print(results)
        check_compute_single_action(trainer)
def test_sac_compilation(self):
    """Test whether an SACTrainer can be built with all frameworks."""
    ray.init()
    config = sac.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    num_iterations = 1

    # eager (discrete and cont. actions).
    for _ in framework_iterator(config, ["tf", "eager"]):
        for env in [
                "CartPole-v0",
                "Pendulum-v0",
        ]:
            print("Env={}".format(env))
            trainer = sac.SACTrainer(config=config, env=env)
            for i in range(num_iterations):
                results = trainer.train()
                print(results)
def get_rl_agent(agent_name, config, env_to_agent):
    if agent_name == A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        raise Exception("Not valid agent name")
    return agent
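# A minimal usage sketch for get_rl_agent(). The SAC constant is defined
# elsewhere in the surrounding module; the env id and config values below
# are assumptions for illustration only.
config = {"framework": "torch", "num_workers": 0}
agent = get_rl_agent(SAC, config, "Pendulum-v0")
print(agent.train()["episode_reward_mean"])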
def get_rllib_agent(agent_name, env_name, env, env_to_agent):
    config = get_config(env_name, env, 1) if is_rllib_agent(agent_name) else {}
    if agent_name == RLLIB_A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    return agent
def test_sac_loss_function(self):
    """Tests SAC loss function results across all frameworks."""
    config = sac.DEFAULT_CONFIG.copy()
    # Run locally.
    config["num_workers"] = 0
    config["learning_starts"] = 0
    config["twin_q"] = False
    config["gamma"] = 0.99
    # Switch on deterministic loss so we can compare the loss values.
    config["_deterministic_loss"] = True
    # Use very simple nets.
    config["Q_model"]["fcnet_hiddens"] = [10]
    config["policy_model"]["fcnet_hiddens"] = [10]
    # Make sure, timing differences do not affect trainer.train().
    config["min_iter_time_s"] = 0

    map_ = {
        # Normal net.
        "default_policy/sequential/action_1/kernel": "action_model.action_0._model.0.weight",
        "default_policy/sequential/action_1/bias": "action_model.action_0._model.0.bias",
        "default_policy/sequential/action_out/kernel": "action_model.action_out._model.0.weight",
        "default_policy/sequential/action_out/bias": "action_model.action_out._model.0.bias",
        "default_policy/sequential_1/q_hidden_0/kernel": "q_net.q_hidden_0._model.0.weight",
        "default_policy/sequential_1/q_hidden_0/bias": "q_net.q_hidden_0._model.0.bias",
        "default_policy/sequential_1/q_out/kernel": "q_net.q_out._model.0.weight",
        "default_policy/sequential_1/q_out/bias": "q_net.q_out._model.0.bias",
        "default_policy/value_out/kernel": "_value_branch._model.0.weight",
        "default_policy/value_out/bias": "_value_branch._model.0.bias",
        # Target net.
        "default_policy/sequential_2/action_1/kernel": "action_model.action_0._model.0.weight",
        "default_policy/sequential_2/action_1/bias": "action_model.action_0._model.0.bias",
        "default_policy/sequential_2/action_out/kernel": "action_model.action_out._model.0.weight",
        "default_policy/sequential_2/action_out/bias": "action_model.action_out._model.0.bias",
        "default_policy/sequential_3/q_hidden_0/kernel": "q_net.q_hidden_0._model.0.weight",
        "default_policy/sequential_3/q_hidden_0/bias": "q_net.q_hidden_0._model.0.bias",
        "default_policy/sequential_3/q_out/kernel": "q_net.q_out._model.0.weight",
        "default_policy/sequential_3/q_out/bias": "q_net.q_out._model.0.bias",
        "default_policy/value_out_1/kernel": "_value_branch._model.0.weight",
        "default_policy/value_out_1/bias": "_value_branch._model.0.bias",
    }

    env = SimpleEnv
    batch_size = 100
    if env is SimpleEnv:
        obs_size = (batch_size, 1)
        actions = np.random.random(size=(batch_size, 1))
    elif env == "CartPole-v0":
        obs_size = (batch_size, 4)
        actions = np.random.randint(0, 2, size=(batch_size, ))
    else:
        obs_size = (batch_size, 3)
        actions = np.random.random(size=(batch_size, 1))

    # Batch of size=n.
    input_ = self._get_batch_helper(obs_size, actions, batch_size)

    # Simply compare loss values AND grads of all frameworks with each
    # other.
    prev_fw_loss = weights_dict = None
    expect_c, expect_a, expect_e, expect_t = None, None, None, None
    # History of tf-updated NN-weights over n training steps.
    tf_updated_weights = []
    # History of input batches used.
    tf_inputs = []

    for fw, sess in framework_iterator(
            config, frameworks=("tf", "torch"), session=True):
        # Generate Trainer and get its default Policy object.
        trainer = sac.SACTrainer(config=config, env=env)
        policy = trainer.get_policy()
        p_sess = None
        if sess:
            p_sess = policy.get_session()

        # Set all weights (of all nets) to fixed values.
        if weights_dict is None:
            assert fw == "tf"  # Start with the tf vars-dict.
            weights_dict = policy.get_weights()
        else:
            assert fw == "torch"
            # Then transfer that to torch Model.
            model_dict = self._translate_weights_to_torch(
                weights_dict, map_)
            policy.model.load_state_dict(model_dict)
            policy.target_model.load_state_dict(model_dict)

        if fw == "tf":
            log_alpha = weights_dict["default_policy/log_alpha"]
        elif fw == "torch":
            # Actually convert to torch tensors.
            input_ = policy._lazy_tensor_dict(input_)
            input_ = {k: input_[k] for k in input_.keys()}
            log_alpha = policy.model.log_alpha.detach().numpy()[0]

        # Only run the expectation once, should be the same anyways
        # for all frameworks.
        if expect_c is None:
            expect_c, expect_a, expect_e, expect_t = \
                self._sac_loss_helper(input_, weights_dict,
                                      sorted(weights_dict.keys()),
                                      log_alpha, fw,
                                      gamma=config["gamma"], sess=sess)

        # Get actual outs and compare to expectation AND previous
        # framework. c=critic, a=actor, e=entropy, t=td-error.
        if fw == "tf":
            c, a, e, t, tf_c_grads, tf_a_grads, tf_e_grads = \
                p_sess.run([
                    policy.critic_loss, policy.actor_loss,
                    policy.alpha_loss, policy.td_error,
                    policy.optimizer().compute_gradients(
                        policy.critic_loss[0],
                        policy.model.q_variables()),
                    policy.optimizer().compute_gradients(
                        policy.actor_loss,
                        policy.model.policy_variables()),
                    policy.optimizer().compute_gradients(
                        policy.alpha_loss, policy.model.log_alpha)],
                    feed_dict=policy._get_loss_inputs_dict(
                        input_, shuffle=False))
            tf_c_grads = [g for g, v in tf_c_grads]
            tf_a_grads = [g for g, v in tf_a_grads]
            tf_e_grads = [g for g, v in tf_e_grads]

        elif fw == "torch":
            loss_torch(policy, policy.model, None, input_)
            c, a, e, t = policy.critic_loss, policy.actor_loss, \
                policy.alpha_loss, policy.td_error

            # Test actor gradients.
            policy.actor_optim.zero_grad()
            assert all(v.grad is None for v in policy.model.q_variables())
            assert all(
                v.grad is None for v in policy.model.policy_variables())
            assert policy.model.log_alpha.grad is None
            a.backward()
            # `actor_loss` depends on Q-net vars (but these grads must
            # be ignored and overridden in critic_loss.backward!).
            assert not any(
                v.grad is None for v in policy.model.q_variables())
            assert not all(
                torch.mean(v.grad) == 0
                for v in policy.model.policy_variables())
            assert not all(
                torch.min(v.grad) == 0
                for v in policy.model.policy_variables())
            assert policy.model.log_alpha.grad is None
            # Compare with tf ones.
            torch_a_grads = [
                v.grad for v in policy.model.policy_variables()
            ]
            for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g))
                else:
                    check(tf_g, torch_g)

            # Test critic gradients.
            policy.critic_optims[0].zero_grad()
            assert all(
                torch.mean(v.grad) == 0.0
                for v in policy.model.q_variables())
            assert all(
                torch.min(v.grad) == 0.0
                for v in policy.model.q_variables())
            assert policy.model.log_alpha.grad is None
            c[0].backward()
            assert not all(
                torch.mean(v.grad) == 0
                for v in policy.model.q_variables())
            assert not all(
                torch.min(v.grad) == 0
                for v in policy.model.q_variables())
            assert policy.model.log_alpha.grad is None
            # Compare with tf ones.
            torch_c_grads = [v.grad for v in policy.model.q_variables()]
            for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g))
                else:
                    check(tf_g, torch_g)
            # Compare (unchanged(!) actor grads) with tf ones.
            torch_a_grads = [
                v.grad for v in policy.model.policy_variables()
            ]
            for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g))
                else:
                    check(tf_g, torch_g)

            # Test alpha gradient.
            policy.alpha_optim.zero_grad()
            assert policy.model.log_alpha.grad is None
            e.backward()
            assert policy.model.log_alpha.grad is not None
            check(policy.model.log_alpha.grad, tf_e_grads)

        check(c, expect_c)
        check(a, expect_a)
        check(e, expect_e)
        check(t, expect_t)

        # Store this framework's losses in prev_fw_loss to compare with
        # next framework's outputs.
        if prev_fw_loss is not None:
            check(c, prev_fw_loss[0])
            check(a, prev_fw_loss[1])
            check(e, prev_fw_loss[2])
            check(t, prev_fw_loss[3])

        prev_fw_loss = (c, a, e, t)

        # Update weights from our batch (n times).
        for update_iteration in range(10):
            print("train iteration {}".format(update_iteration))
            if fw == "tf":
                in_ = self._get_batch_helper(obs_size, actions, batch_size)
                tf_inputs.append(in_)
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                trainer.optimizer._fake_batch = in_
                trainer.train()
                updated_weights = policy.get_weights()
                # Net must have changed.
                if tf_updated_weights:
                    check(
                        updated_weights[
                            "default_policy/sequential/action_1/kernel"],
                        tf_updated_weights[-1][
                            "default_policy/sequential/action_1/kernel"],
                        false=True)
                tf_updated_weights.append(updated_weights)

            # Compare with updated tf-weights. Must all be the same.
            else:
                tf_weights = tf_updated_weights[update_iteration]
                in_ = tf_inputs[update_iteration]
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                trainer.optimizer._fake_batch = in_
                trainer.train()
                # Compare updated model.
                for tf_key in sorted(tf_weights.keys())[2:10]:
                    tf_var = tf_weights[tf_key]
                    torch_var = policy.model.state_dict()[map_[tf_key]]
                    if tf_var.shape != torch_var.shape:
                        check(tf_var, np.transpose(torch_var), rtol=0.05)
                    else:
                        check(tf_var, torch_var, rtol=0.05)
                # And alpha.
                check(policy.model.log_alpha,
                      tf_weights["default_policy/log_alpha"])
                # Compare target nets.
                for tf_key in sorted(tf_weights.keys())[10:18]:
                    tf_var = tf_weights[tf_key]
                    torch_var = policy.target_model.state_dict()[
                        map_[tf_key]]
                    if tf_var.shape != torch_var.shape:
                        check(tf_var, np.transpose(torch_var), rtol=0.05)
                    else:
                        check(tf_var, torch_var, rtol=0.05)
def test_sac_loss_function(self):
    """Tests SAC loss function results across all frameworks."""
    config = sac.DEFAULT_CONFIG.copy()
    # Run locally.
    config["num_workers"] = 0
    config["learning_starts"] = 0
    config["twin_q"] = False
    config["gamma"] = 0.99
    # Switch on deterministic loss so we can compare the loss values.
    config["_deterministic_loss"] = True
    # Use very simple nets.
    config["Q_model"]["fcnet_hiddens"] = [10]
    config["policy_model"]["fcnet_hiddens"] = [10]
    # Make sure, timing differences do not affect trainer.train().
    config["min_iter_time_s"] = 0
    # Test SAC with Simplex action space.
    config["env_config"] = {"simplex_actions": True}

    map_ = {
        # Action net.
        "default_policy/fc_1/kernel": "action_model._hidden_layers.0._model.0.weight",
        "default_policy/fc_1/bias": "action_model._hidden_layers.0._model.0.bias",
        "default_policy/fc_out/kernel": "action_model._logits._model.0.weight",
        "default_policy/fc_out/bias": "action_model._logits._model.0.bias",
        "default_policy/value_out/kernel": "action_model._value_branch._model.0.weight",
        "default_policy/value_out/bias": "action_model._value_branch._model.0.bias",
        # Q-net.
        "default_policy/fc_1_1/kernel": "q_net._hidden_layers.0._model.0.weight",
        "default_policy/fc_1_1/bias": "q_net._hidden_layers.0._model.0.bias",
        "default_policy/fc_out_1/kernel": "q_net._logits._model.0.weight",
        "default_policy/fc_out_1/bias": "q_net._logits._model.0.bias",
        "default_policy/value_out_1/kernel": "q_net._value_branch._model.0.weight",
        "default_policy/value_out_1/bias": "q_net._value_branch._model.0.bias",
        "default_policy/log_alpha": "log_alpha",
        # Target action-net.
        "default_policy/fc_1_2/kernel": "action_model._hidden_layers.0._model.0.weight",
        "default_policy/fc_1_2/bias": "action_model._hidden_layers.0._model.0.bias",
        "default_policy/fc_out_2/kernel": "action_model._logits._model.0.weight",
        "default_policy/fc_out_2/bias": "action_model._logits._model.0.bias",
        "default_policy/value_out_2/kernel": "action_model._value_branch._model.0.weight",
        "default_policy/value_out_2/bias": "action_model._value_branch._model.0.bias",
        # Target Q-net.
        "default_policy/fc_1_3/kernel": "q_net._hidden_layers.0._model.0.weight",
        "default_policy/fc_1_3/bias": "q_net._hidden_layers.0._model.0.bias",
        "default_policy/fc_out_3/kernel": "q_net._logits._model.0.weight",
        "default_policy/fc_out_3/bias": "q_net._logits._model.0.bias",
        "default_policy/value_out_3/kernel": "q_net._value_branch._model.0.weight",
        "default_policy/value_out_3/bias": "q_net._value_branch._model.0.bias",
        "default_policy/log_alpha_1": "log_alpha",
    }

    env = SimpleEnv
    batch_size = 100
    obs_size = (batch_size, 1)
    actions = np.random.random(size=(batch_size, 2))

    # Batch of size=n.
    input_ = self._get_batch_helper(obs_size, actions, batch_size)

    # Simply compare loss values AND grads of all frameworks with each
    # other.
    prev_fw_loss = weights_dict = None
    expect_c, expect_a, expect_e, expect_t = None, None, None, None
    # History of tf-updated NN-weights over n training steps.
    tf_updated_weights = []
    # History of input batches used.
    tf_inputs = []

    for fw, sess in framework_iterator(
            config, frameworks=("tf", "torch"), session=True):
        # Generate Trainer and get its default Policy object.
        trainer = sac.SACTrainer(config=config, env=env)
        policy = trainer.get_policy()
        p_sess = None
        if sess:
            p_sess = policy.get_session()

        # Set all weights (of all nets) to fixed values.
        if weights_dict is None:
            # Start with the tf vars-dict.
            assert fw in ["tf2", "tf", "tfe"]
            weights_dict = policy.get_weights()
            if fw == "tfe":
                log_alpha = weights_dict[10]
                weights_dict = self._translate_tfe_weights(
                    weights_dict, map_)
        else:
            assert fw == "torch"  # Then transfer that to torch Model.
            model_dict = self._translate_weights_to_torch(
                weights_dict, map_)
            # Have to add this here (not a parameter in tf, but must be
            # one in torch, so it gets properly copied to the GPU(s)).
            model_dict["target_entropy"] = policy.model.target_entropy
            policy.model.load_state_dict(model_dict)
            policy.target_model.load_state_dict(model_dict)

        if fw == "tf":
            log_alpha = weights_dict["default_policy/log_alpha"]
        elif fw == "torch":
            # Actually convert to torch tensors (by accessing everything).
            input_ = policy._lazy_tensor_dict(input_)
            input_ = {k: input_[k] for k in input_.keys()}
            log_alpha = policy.model.log_alpha.detach().cpu().numpy()[0]

        # Only run the expectation once, should be the same anyways
        # for all frameworks.
        if expect_c is None:
            expect_c, expect_a, expect_e, expect_t = \
                self._sac_loss_helper(input_, weights_dict,
                                      sorted(weights_dict.keys()),
                                      log_alpha, fw,
                                      gamma=config["gamma"], sess=sess)

        # Get actual outs and compare to expectation AND previous
        # framework. c=critic, a=actor, e=entropy, t=td-error.
        if fw == "tf":
            c, a, e, t, tf_c_grads, tf_a_grads, tf_e_grads = \
                p_sess.run([
                    policy.critic_loss, policy.actor_loss,
                    policy.alpha_loss, policy.td_error,
                    policy.optimizer().compute_gradients(
                        policy.critic_loss[0],
                        [v for v in policy.model.q_variables()
                         if "value_" not in v.name]),
                    policy.optimizer().compute_gradients(
                        policy.actor_loss,
                        [v for v in policy.model.policy_variables()
                         if "value_" not in v.name]),
                    policy.optimizer().compute_gradients(
                        policy.alpha_loss, policy.model.log_alpha)],
                    feed_dict=policy._get_loss_inputs_dict(
                        input_, shuffle=False))
            tf_c_grads = [g for g, v in tf_c_grads]
            tf_a_grads = [g for g, v in tf_a_grads]
            tf_e_grads = [g for g, v in tf_e_grads]

        elif fw == "tfe":
            with tf.GradientTape() as tape:
                tf_loss(policy, policy.model, None, input_)
            c, a, e, t = policy.critic_loss, policy.actor_loss, \
                policy.alpha_loss, policy.td_error
            vars = tape.watched_variables()
            tf_c_grads = tape.gradient(c[0], vars[6:10])
            tf_a_grads = tape.gradient(a, vars[2:6])
            tf_e_grads = tape.gradient(e, vars[10])

        elif fw == "torch":
            loss_torch(policy, policy.model, None, input_)
            c, a, e, t = policy.critic_loss, policy.actor_loss, \
                policy.alpha_loss, policy.model.td_error

            # Test actor gradients.
            policy.actor_optim.zero_grad()
            assert all(v.grad is None for v in policy.model.q_variables())
            assert all(
                v.grad is None for v in policy.model.policy_variables())
            assert policy.model.log_alpha.grad is None
            a.backward()
            # `actor_loss` depends on Q-net vars (but these grads must
            # be ignored and overridden in critic_loss.backward!).
            assert not all(
                torch.mean(v.grad) == 0
                for v in policy.model.policy_variables())
            assert not all(
                torch.min(v.grad) == 0
                for v in policy.model.policy_variables())
            assert policy.model.log_alpha.grad is None
            # Compare with tf ones.
            torch_a_grads = [
                v.grad for v in policy.model.policy_variables()
                if v.grad is not None
            ]
            check(tf_a_grads[2],
                  np.transpose(torch_a_grads[0].detach().cpu()))

            # Test critic gradients.
            policy.critic_optims[0].zero_grad()
            assert all(
                torch.mean(v.grad) == 0.0
                for v in policy.model.q_variables() if v.grad is not None)
            assert all(
                torch.min(v.grad) == 0.0
                for v in policy.model.q_variables() if v.grad is not None)
            assert policy.model.log_alpha.grad is None
            c[0].backward()
            assert not all(
                torch.mean(v.grad) == 0
                for v in policy.model.q_variables() if v.grad is not None)
            assert not all(
                torch.min(v.grad) == 0
                for v in policy.model.q_variables() if v.grad is not None)
            assert policy.model.log_alpha.grad is None
            # Compare with tf ones.
            torch_c_grads = [v.grad for v in policy.model.q_variables()]
            check(tf_c_grads[0],
                  np.transpose(torch_c_grads[2].detach().cpu()))
            # Compare (unchanged(!) actor grads) with tf ones.
            torch_a_grads = [
                v.grad for v in policy.model.policy_variables()
            ]
            check(tf_a_grads[2],
                  np.transpose(torch_a_grads[0].detach().cpu()))

            # Test alpha gradient.
            policy.alpha_optim.zero_grad()
            assert policy.model.log_alpha.grad is None
            e.backward()
            assert policy.model.log_alpha.grad is not None
            check(policy.model.log_alpha.grad, tf_e_grads)

        check(c, expect_c)
        check(a, expect_a)
        check(e, expect_e)
        check(t, expect_t)

        # Store this framework's losses in prev_fw_loss to compare with
        # next framework's outputs.
        if prev_fw_loss is not None:
            check(c, prev_fw_loss[0])
            check(a, prev_fw_loss[1])
            check(e, prev_fw_loss[2])
            check(t, prev_fw_loss[3])

        prev_fw_loss = (c, a, e, t)

        # Update weights from our batch (n times).
        for update_iteration in range(5):
            print("train iteration {}".format(update_iteration))
            if fw == "tf":
                in_ = self._get_batch_helper(obs_size, actions, batch_size)
                tf_inputs.append(in_)
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                buf = LocalReplayBuffer.get_instance_for_testing()
                buf._fake_batch = in_
                trainer.train()
                updated_weights = policy.get_weights()
                # Net must have changed.
                if tf_updated_weights:
                    check(
                        updated_weights["default_policy/fc_1/kernel"],
                        tf_updated_weights[-1]
                        ["default_policy/fc_1/kernel"],
                        false=True)
                tf_updated_weights.append(updated_weights)

            # Compare with updated tf-weights. Must all be the same.
            else:
                tf_weights = tf_updated_weights[update_iteration]
                in_ = tf_inputs[update_iteration]
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                buf = LocalReplayBuffer.get_instance_for_testing()
                buf._fake_batch = in_
                trainer.train()
                # Compare updated model.
                for tf_key in sorted(tf_weights.keys()):
                    if re.search("_[23]|alpha", tf_key):
                        continue
                    tf_var = tf_weights[tf_key]
                    torch_var = policy.model.state_dict()[map_[tf_key]]
                    if tf_var.shape != torch_var.shape:
                        check(
                            tf_var,
                            np.transpose(torch_var.detach().cpu()),
                            atol=0.003)
                    else:
                        check(tf_var, torch_var, atol=0.003)
                # And alpha.
                check(policy.model.log_alpha,
                      tf_weights["default_policy/log_alpha"])
                # Compare target nets.
                for tf_key in sorted(tf_weights.keys()):
                    if not re.search("_[23]", tf_key):
                        continue
                    tf_var = tf_weights[tf_key]
                    torch_var = policy.target_model.state_dict()[
                        map_[tf_key]]
                    if tf_var.shape != torch_var.shape:
                        check(
                            tf_var,
                            np.transpose(torch_var.detach().cpu()),
                            atol=0.003)
                    else:
                        check(tf_var, torch_var, atol=0.003)
        trainer.stop()
        if render:
            env.render()
        if done == 1 and reward > 0:
            break


if __name__ == "__main__":
    ray.shutdown()
    ray.init(ignore_reinit_error=True)

    config = sac.DEFAULT_CONFIG.copy()
    config["log_level"] = "WARN"

    register_env("projectile-v0", lambda config: Projectile_v0())

    # train a policy with RLlib using SAC
    agent = sac.SACTrainer(config, env=SELECT_ENV)
    checkpoint_path, reward_history = train_policy(agent, CHECKPOINT_PATH)
    print(reward_history)

    # apply the trained policy in a use case
    agent.restore(checkpoint_path)
    env = gym.make(SELECT_ENV)
    rollout_actions(agent, env)
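# train_policy() is not defined in this snippet. A minimal sketch of what
# such a helper could look like (it would need to be defined before the
# __main__ block above); the iteration count, save location and return
# values are assumptions, not taken from the original script.
def train_policy(agent, checkpoint_root, n_iter=50):
    reward_history = []
    checkpoint_path = None
    for _ in range(n_iter):
        result = agent.train()
        reward_history.append(result["episode_reward_mean"])
        # Keep the latest checkpoint path so the caller can restore it.
        checkpoint_path = agent.save(checkpoint_root)
    return checkpoint_path, reward_history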
config['evaluation_config'] = {'explore': False}
config['evaluation_num_workers'] = 0  # 0 = evaluate on the training workers

# [Advanced settings]
config['num_gpus'] = 0  # number of GPUs to use, 0 = CPU-only
config['framework'] = 'tf2'  # tf, tfe, tf2 available
config['eager_tracing'] = True
config['rollout_fragment_length'] = 1
config['train_batch_size'] = 1
config['explore'] = False
config['normalize_actions'] = True

# [Use previously recorded data]
config['input'] = os.path.join(os.path.dirname(__file__),
                               'Output_Data_{}'.format(trainer))
config['input_evaluation'] = []

ray.init()

if trainer == 'PPO':
    from ray.rllib.agents import ppo
    agent = ppo.PPOTrainer(env=dummyenv, config=config)
elif trainer == 'SAC':
    from ray.rllib.agents import sac
    agent = sac.SACTrainer(env=dummyenv, config=config)

for n in range(10):
    result = agent.train()
        for el in self.interfaces:
            ray.get(el.clear_callback.remote())
        time.sleep(INTERVAL)
        self.end_episode(eid, obs)


register_env("moody", lambda _: MoodyEnvLoop(interfaces, my_config, INTERVAL))

config = sac.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 0
config["eager"] = False
config["timesteps_per_iteration"] = 20
config["learning_starts"] = 60
config['use_state_preprocessor'] = True
# Required to prevent rllib from thinking we subclass gym env
config["normalize_actions"] = False

trainer = sac.SACTrainer(config=config, env="moody")

print("Beginning training.")
for i in range(0, 100):
    print(i)
    result = trainer.train()
    print("Iteration {}, reward {}, timesteps {}".format(
        i, result["episode_reward_mean"], result["timesteps_total"]))
    trainer.save()
temp_env = Nav2dEnv()
action_space = temp_env.action_space
observation_space = temp_env.observation_space
print("ACTION SPACE : ", action_space,
      "OBSERVATION SPACE : ", observation_space)

#### Config ##################################################
config = {
    "framework": "tf",
    "env": temp_env_name,
    "num_workers": 1,
}

#### Restore agent #########################################
agent = sac.SACTrainer(config=config)
checkpoint = r"C:\Users\Arcn\Desktop\twodenv\single\results\checkpoint_950\checkpoint-950"
agent.restore(checkpoint)

#### Create test environment ###############################
test_env = Nav2dEnv()
for ep in range(10):
    #### Show, record a video, and log the model's performance #
    obs = test_env.reset()
    action = action_space.sample()
    start = time.time()
    policy = agent.get_policy()
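    # The snippet is cut off here. A minimal sketch of how such an evaluation
    # episode is often completed; the loop structure and reward bookkeeping
    # below are assumptions, not part of the original script.
    done, ep_reward = False, 0.0
    while not done:
        action = agent.compute_action(obs)
        obs, reward, done, info = test_env.step(action)
        ep_reward += reward
    print("Episode {} reward: {}".format(ep, ep_reward))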
def test_sac_compilation(self):
    """Tests whether an SACTrainer can be built with all frameworks."""
    config = sac.DEFAULT_CONFIG.copy()
    config["Q_model"] = sac.DEFAULT_CONFIG["Q_model"].copy()
    config["num_workers"] = 0  # Run locally.
    config["n_step"] = 3
    config["twin_q"] = True
    config["learning_starts"] = 0
    config["prioritized_replay"] = True
    config["rollout_fragment_length"] = 10
    config["train_batch_size"] = 10
    # If we use default buffer size (1e6), the buffer will take up
    # 169.445 GB memory, which is beyond travis-ci's current (Mar 19, 2021)
    # available system memory (8.34816 GB).
    config["buffer_size"] = 40000
    # Test with saved replay buffer.
    config["store_buffer_in_checkpoints"] = True
    num_iterations = 1

    ModelCatalog.register_custom_model("batch_norm", KerasBatchNormModel)
    ModelCatalog.register_custom_model("batch_norm_torch",
                                       TorchBatchNormModel)

    image_space = Box(-1.0, 1.0, shape=(84, 84, 3))
    simple_space = Box(-1.0, 1.0, shape=(3, ))

    tune.register_env(
        "random_dict_env",
        lambda _: RandomEnv({
            "observation_space": Dict({
                "a": simple_space,
                "b": Discrete(2),
                "c": image_space,
            }),
            "action_space": Box(-1.0, 1.0, shape=(1, )),
        }),
    )
    tune.register_env(
        "random_tuple_env",
        lambda _: RandomEnv({
            "observation_space": Tuple(
                [simple_space, Discrete(2), image_space]),
            "action_space": Box(-1.0, 1.0, shape=(1, )),
        }),
    )

    for fw in framework_iterator(config, with_eager_tracing=True):
        # Test for different env types (discrete w/ and w/o image, + cont).
        for env in [
                "random_dict_env",
                "random_tuple_env",
                # "MsPacmanNoFrameskip-v4",
                "CartPole-v0",
        ]:
            print("Env={}".format(env))

            # Test making the Q-model a custom one for CartPole, otherwise,
            # use the default model.
            config["Q_model"]["custom_model"] = (
                "batch_norm{}".format("_torch" if fw == "torch" else "")
                if env == "CartPole-v0" else None)

            trainer = sac.SACTrainer(config=config, env=env)
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)
            check_compute_single_action(trainer)

            # Test, whether the replay buffer is saved along with
            # a checkpoint (no point in doing it for all frameworks since
            # this is framework agnostic).
            if fw == "tf" and env == "CartPole-v0":
                checkpoint = trainer.save()
                new_trainer = sac.SACTrainer(config, env=env)
                new_trainer.restore(checkpoint)
                # Get some data from the buffer and compare.
                data = trainer.local_replay_buffer.replay_buffers[
                    "default_policy"]._storage[:42 + 42]
                new_data = new_trainer.local_replay_buffer.replay_buffers[
                    "default_policy"]._storage[:42 + 42]
                check(data, new_data)
                new_trainer.stop()

            trainer.stop()
def render(checkpoint, home_path):
    """
    Renders pybullet and mujoco environments.
    """
    alg = re.match('.+?(?=_)',
                   os.path.basename(os.path.normpath(home_path))).group(0)
    current_env = re.search(
        "(?<=_).*?(?=_)",
        os.path.basename(os.path.normpath(home_path))).group(0)
    checkpoint_path = home_path + "checkpoint_" + str(checkpoint) + \
        "/checkpoint-" + str(checkpoint)
    config = json.load(open(home_path + "params.json"))
    config_bin = pickle.load(open(home_path + "params.pkl", "rb"))
    ray.shutdown()
    import pybullet_envs
    ray.init()
    ModelCatalog.register_custom_model("RBF", RBFModel)
    ModelCatalog.register_custom_model("MLP_2_64", MLP)
    ModelCatalog.register_custom_model("linear", Linear)

    if alg == "PPO":
        trainer = ppo.PPOTrainer(config_bin)
    if alg == "SAC":
        trainer = sac.SACTrainer(config)
    if alg == "DDPG":
        trainer = ddpg.DDPGTrainer(config)
    if alg == "PG":
        trainer = pg.PGTrainer(config)
    if alg == "A3C":
        trainer = a3c.A3CTrainer(config)
    if alg == "TD3":
        trainer = td3.TD3Trainer(config)
    if alg == "ES":
        trainer = es.ESTrainer(config)
    if alg == "ARS":
        trainer = ars.ARSTrainer(config)
    # "normalize_actions": true,
    trainer.restore(checkpoint_path)

    if "Bullet" in current_env:
        env = gym.make(current_env, render=True)
    else:
        env = gym.make(current_env)
    # env.unwrapped.reset_model = det_reset_model
    env._max_episode_steps = 10000
    obs = env.reset()

    action_hist = []
    m_act_hist = []
    state_hist = []
    obs_hist = []
    reward_hist = []

    done = False
    step = 0

    for t in range(10000):
        # For some algorithms you can get the sample mean out; you need to
        # change the value on the index to match your env for now.
        # mean_actions = out_dict['behaviour_logits'][:17]
        # actions = trainer.compute_action(obs.flatten())
        # sampled_actions, _, out_dict = trainer.compute_action(obs.flatten(), full_fetch=True)
        sampled_actions = trainer.compute_action(obs.flatten())
        actions = sampled_actions
        obs, reward, done, _ = env.step(np.asarray(actions))
        # env.camera_adjust()
        env.render(mode='human')
        time.sleep(0.01)
        # env.render(mode='rgb_array', close=True)
        # p.computeViewMatrix(cameraEyePosition=[0, 10, 5],
        #                     cameraTargetPosition=[0, 0, 0],
        #                     cameraUpVector=[0, 0, 0])

        # if step % 1000 == 0:
        #     env.reset()
        # step += 1

        action_hist.append(np.copy(actions))
        obs_hist.append(np.copy(obs))
        reward_hist.append(np.copy(reward))
        if done:
            obs = env.reset()

    # print(sum(reward_hist))
    # print(obs_hist)
    # plt.plot(action_hist)
    # plt.figure()
    # plt.plot(obs_hist)
    # plt.figure()

    # Reminder that the behavior logits that come out are the mean and
    # logstd (not log mean, despite the name logit).
    # trainer.compute_action(obs, full_fetch=True)
    trainer.compute_action(obs)
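# A minimal usage sketch for render(). The checkpoint number and the results
# directory below are assumptions for illustration; home_path is expected to
# end with a slash and to contain params.json / params.pkl plus
# checkpoint_<n> subfolders, with the algorithm and env name encoded in the
# directory name (e.g. "SAC_HopperBulletEnv-v0_...").
render(950, "./results/SAC_HopperBulletEnv-v0_experiment/")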
config = sac.DEFAULT_CONFIG.copy()
# Add a new RE3UpdateCallbacks
config["callbacks"] = MultiCallbacks([
    config["callbacks"],
    partial(
        RE3UpdateCallbacks,
        embeds_dim=128,
        beta_schedule="linear_decay",
        k_nn=50,
    ),
])
config["env"] = "LunarLanderContinuous-v2"
config["seed"] = 12345
# Add type as RE3 in the exploration_config parameter
config["exploration_config"] = {
    "type": "RE3",
    "sub_exploration": {
        "type": "StochasticSampling",
    },
}
num_iterations = 2000

trainer = sac.SACTrainer(config=config)
for i in range(num_iterations):
    result = trainer.train()
    print(result)
trainer.stop()
ray.shutdown()
    trial=analysis.get_best_trial("episode_reward_mean"),
    metric="episode_reward_mean")
print('checkpoints=', checkpoints)
checkpoint_path, reward = checkpoints[0]
print('checkpoint_path=', checkpoint_path)

config = {
    "env": env_name,
    "num_gpus": 0,
    "num_workers": 1,
    "framework": "tf2",
}
# agent = ppo.PPOTrainer(config=config, env=env_name)
agent = sac.SACTrainer(config=config, env=env_name)
agent.restore(checkpoint_path)
print('agent=', agent)

########################################
import gym

# instantiate env class
env = gym.make(env_name)

# run until episode ends
episode_reward = 0
done = False
obs = env.reset()
while not done:
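    # The original snippet is cut off at the loop header. A minimal sketch of
    # the usual rollout body (standard gym/RLlib calls, not taken from the
    # original source):
    action = agent.compute_action(obs)
    obs, reward, done, info = env.step(action)
    episode_reward += reward
print('episode_reward=', episode_reward)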
ModelCatalog.register_custom_model("MLPModel", MLPModel)
ModelCatalog.register_custom_model("MLPModelV2", MLPModelV2)

if algorithm == 'A2C':
    RLAgent = a2c.A2CTrainer(env=env_name, config=config)
elif algorithm == 'APEX_DDPG':
    RLAgent = apex.ApexDDPGTrainer(env=env_name, config=config)
elif algorithm == 'DDPG':
    RLAgent = ddpg.DDPGTrainer(env=env_name, config=config)
elif algorithm == 'IMPALA':
    RLAgent = impala.ImpalaTrainer(env=env_name, config=config)
elif algorithm == 'PPO':
    RLAgent = ppo.PPOTrainer(env=env_name, config=config)
elif algorithm == 'SAC':
    RLAgent = sac.SACTrainer(env=env_name, config=config)
elif algorithm == 'TD3':
    RLAgent = td3.TD3Trainer(env=env_name, config=config)
RLAgent.restore(checkpoint_path)

num_runs = 50
totalRewards = np.empty((num_runs, ))

for j in range(num_runs):
    observations = env.reset()
    rewards, action_dict = {}, {}
    for agent_id in env.agent_ids:
        assert isinstance(agent_id, int), "Error: agent_ids are not ints."
    action_dict = dict(
        zip(env.agent_ids,
            [env.action_space_dict[i].sample() for i in env.agent_ids]))