def check_support(alg, config, train=True, check_bounds=False, tfe=False):
    config["log_level"] = "ERROR"
    config["train_batch_size"] = 10
    config["rollout_fragment_length"] = 10

    def _do_check(alg, config, a_name, o_name):
        fw = config["framework"]
        action_space = ACTION_SPACES_TO_TEST[a_name]
        obs_space = OBSERVATION_SPACES_TO_TEST[o_name]
        print(
            "=== Testing {} (fw={}) A={} S={} ===".format(
                alg, fw, action_space, obs_space
            )
        )
        config.update(
            dict(
                env_config=dict(
                    action_space=action_space,
                    observation_space=obs_space,
                    reward_space=Box(1.0, 1.0, shape=(), dtype=np.float32),
                    p_done=1.0,
                    check_action_bounds=check_bounds,
                )
            )
        )
        stat = "ok"
        try:
            a = get_trainer_class(alg)(config=config, env=RandomEnv)
        except ray.exceptions.RayActorError as e:
            # Note: accessing e.args[2] requires at least 3 args.
            if len(e.args) >= 3 and isinstance(e.args[2], UnsupportedSpaceException):
                stat = "unsupported"
            elif isinstance(e.args[0].args[2], UnsupportedSpaceException):
                stat = "unsupported"
            else:
                raise
        except UnsupportedSpaceException:
            stat = "unsupported"
        else:
            if alg not in ["DDPG", "ES", "ARS", "SAC"]:
                # 2D (image) input: Expect VisionNet.
                if o_name in ["atari", "image"]:
                    if fw == "torch":
                        assert isinstance(a.get_policy().model, TorchVisionNet)
                    else:
                        assert isinstance(a.get_policy().model, VisionNet)
                # 1D input: Expect FCNet.
                elif o_name == "vector1d":
                    if fw == "torch":
                        assert isinstance(a.get_policy().model, TorchFCNet)
                    else:
                        assert isinstance(a.get_policy().model, FCNet)
                # Could be either one: ComplexNet (if disabled Preprocessor)
                # or FCNet (w/ Preprocessor).
                elif o_name == "vector2d":
                    if fw == "torch":
                        assert isinstance(
                            a.get_policy().model, (TorchComplexNet, TorchFCNet)
                        )
                    else:
                        assert isinstance(a.get_policy().model, (ComplexNet, FCNet))
            if train:
                a.train()
            a.stop()
        print(stat)

    frameworks = ("tf", "torch")
    if tfe:
        frameworks += ("tf2", "tfe")
    for _ in framework_iterator(config, frameworks=frameworks):
        # Zip through action- and obs-spaces.
        for a_name, o_name in zip(
            ACTION_SPACES_TO_TEST.keys(), OBSERVATION_SPACES_TO_TEST.keys()
        ):
            _do_check(alg, config, a_name, o_name)
        # Do the remaining obs spaces (with a fixed action space).
        assert len(OBSERVATION_SPACES_TO_TEST) >= len(ACTION_SPACES_TO_TEST)
        fixed_action_key = next(iter(ACTION_SPACES_TO_TEST.keys()))
        for i, o_name in enumerate(OBSERVATION_SPACES_TO_TEST.keys()):
            if i < len(ACTION_SPACES_TO_TEST):
                continue
            _do_check(alg, config, fixed_action_key, o_name)
def test_cql_compilation(self):
    """Test whether a CQLTrainer can be built with all frameworks."""

    # Learns from a historic-data file.
    # To generate this data, first run:
    # $ ./train.py --run=SAC --env=Pendulum-v0 \
    #   --stop='{"timesteps_total": 50000}' \
    #   --config='{"output": "/tmp/out"}'
    rllib_dir = Path(__file__).parent.parent.parent.parent
    print("rllib dir={}".format(rllib_dir))
    data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json")
    print("data_file={} exists={}".format(data_file, os.path.isfile(data_file)))

    config = cql.CQL_DEFAULT_CONFIG.copy()
    config["env"] = "Pendulum-v0"
    config["input"] = [data_file]
    # In the files we use here for testing, actions have already been
    # normalized.
    # This is usually the case when the file was generated by another
    # RLlib algorithm (e.g. PPO or SAC).
    config["actions_in_input_normalized"] = False
    config["clip_actions"] = True
    config["train_batch_size"] = 2000
    config["num_workers"] = 0  # Run locally.
    config["twin_q"] = True
    config["learning_starts"] = 0
    config["bc_iters"] = 2  # 2 BC iters, 2 CQL iters.
    config["rollout_fragment_length"] = 1
    # Switch on off-policy evaluation.
    config["input_evaluation"] = ["is"]
    config["evaluation_interval"] = 2
    config["evaluation_num_episodes"] = 10
    config["evaluation_config"]["input"] = "sampler"
    config["evaluation_parallel_to_training"] = False
    config["evaluation_num_workers"] = 2

    num_iterations = 4

    # Test for tf/torch frameworks.
    for fw in framework_iterator(config):
        trainer = cql.CQLTrainer(config=config)
        for i in range(num_iterations):
            results = trainer.train()
            check_train_results(results)
            print(results)
            eval_results = results.get("evaluation")
            if eval_results:
                print(
                    f"iter={trainer.iteration} "
                    f"R={eval_results['episode_reward_mean']}"
                )
        check_compute_single_action(trainer)

        # Get policy and model.
        pol = trainer.get_policy()
        cql_model = pol.model
        if fw == "tf":
            pol.get_session().__enter__()

        # Example on how to do evaluation on the trained Trainer
        # using the data from CQL's global replay buffer.
        # Get a sample (MultiAgentBatch -> SampleBatch).
        from ray.rllib.agents.cql.cql import replay_buffer

        batch = replay_buffer.replay().policy_batches["default_policy"]

        if fw == "torch":
            obs = torch.from_numpy(batch["obs"])
        else:
            obs = batch["obs"]
            batch["actions"] = batch["actions"].astype(np.float32)

        # Pass the observations through our model to get the features,
        # which we then pass through the Q-head.
        model_out, _ = cql_model({"obs": obs})

        # The estimated Q-values from the (historic) actions in the batch.
        if fw == "torch":
            q_values_old = cql_model.get_q_values(
                model_out, torch.from_numpy(batch["actions"]))
        else:
            q_values_old = cql_model.get_q_values(
                tf.convert_to_tensor(model_out), batch["actions"])

        # The estimated Q-values for the new actions computed
        # by our trainer policy.
        actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0]
        if fw == "torch":
            q_values_new = cql_model.get_q_values(
                model_out, torch.from_numpy(actions_new))
        else:
            q_values_new = cql_model.get_q_values(model_out, actions_new)

        if fw == "tf":
            q_values_old, q_values_new = pol.get_session().run(
                [q_values_old, q_values_new])

        print(f"Q-val batch={q_values_old}")
        print(f"Q-val policy={q_values_new}")

        if fw == "tf":
            pol.get_session().__exit__(None, None, None)

        trainer.stop()
def test_agent_output_logdir(self):
    """Test special value 'logdir' as Agent's output."""
    for fw in framework_iterator():
        agent = self.write_outputs("logdir", fw)
        self.assertEqual(len(glob.glob(agent.logdir + "/output-*.json")), 1)
def test_local(self):
    cf = DEFAULT_CONFIG.copy()
    for fw in framework_iterator(cf):
        agent = PPOTrainer(cf, "CartPole-v0")
        print(agent.train())
def ckpt_restore_test(alg_name, tfe=False):
    config = CONFIGS[alg_name]
    frameworks = (["tfe"] if tfe else []) + ["torch", "tf"]
    for fw in framework_iterator(config, frameworks=frameworks):
        for use_object_store in [False, True]:
            print("use_object_store={}".format(use_object_store))
            cls = get_agent_class(alg_name)
            if "DDPG" in alg_name or "SAC" in alg_name:
                alg1 = cls(config=config, env="Pendulum-v0")
                alg2 = cls(config=config, env="Pendulum-v0")
            else:
                alg1 = cls(config=config, env="CartPole-v0")
                alg2 = cls(config=config, env="CartPole-v0")
            policy1 = alg1.get_policy()

            for _ in range(1):
                res = alg1.train()
                print("current status: " + str(res))

            # Check optimizer state as well.
            optim_state = policy1.get_state().get("_optimizer_variables")

            # Sync the models.
            if use_object_store:
                alg2.restore_from_object(alg1.save_to_object())
            else:
                alg2.restore(alg1.save())

            # Compare optimizer state with re-loaded one.
            if optim_state:
                s2 = alg2.get_policy().get_state().get("_optimizer_variables")
                # Tf -> Compare states 1:1.
                if fw in ["tf", "tfe"]:
                    check(s2, optim_state)
                # For torch, optimizers have state_dicts with keys=params,
                # which are different for the two models (ignore these
                # different keys, but compare all values nevertheless).
                else:
                    for i, s2_ in enumerate(s2):
                        check(
                            list(s2_["state"].values()),
                            list(optim_state[i]["state"].values()))

            for _ in range(1):
                if "DDPG" in alg_name or "SAC" in alg_name:
                    obs = np.clip(
                        np.random.uniform(size=3),
                        policy1.observation_space.low,
                        policy1.observation_space.high)
                else:
                    obs = np.clip(
                        np.random.uniform(size=4),
                        policy1.observation_space.low,
                        policy1.observation_space.high)
                a1 = get_mean_action(alg1, obs)
                a2 = get_mean_action(alg2, obs)
                print("Checking computed actions", alg1, obs, a1, a2)
                if abs(a1 - a2) > 0.1:
                    raise AssertionError(
                        "algo={} [a1={} a2={}]".format(alg_name, a1, a2))
def test_pg_loss_functions(self):
    """Tests the PG loss function math."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = SampleBatch({
        SampleBatch.OBS: np.array(
            [[0.1, 0.2, 0.3, 0.4],
             [0.5, 0.6, 0.7, 0.8],
             [0.9, 1.0, 1.1, 1.2]]),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.EPS_ID: np.array([1234, 1234, 1234]),
        SampleBatch.AGENT_INDEX: np.array([0, 0, 0]),
    })

    for fw, sess in framework_iterator(config, session=True):
        dist_cls = Categorical if fw != "torch" else TorchCategorical
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()
        if sess:
            vars = policy.get_session().run(vars)

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch_ = pg.post_process_advantages(policy, train_batch.copy())
        if fw == "torch":
            train_batch_ = policy._lazy_tensor_dict(train_batch_)

        # Check Advantage values.
        check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        if sess:
            results = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch_, shuffle=False))
        else:
            results = (pg.pg_tf_loss
                       if fw in ["tf2", "tfe"] else pg.pg_torch_loss)(
                           policy,
                           policy.model,
                           dist_class=dist_cls,
                           train_batch=train_batch_)

        # Calculate expected results.
        if fw != "torch":
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS], vars[0], vars[1],
                   framework=fw),
                vars[2], vars[3], framework=fw)
        else:
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS], vars[2], vars[3],
                   framework=fw),
                vars[0], vars[1], framework=fw)
        expected_logp = dist_cls(expected_logits, policy.model).logp(
            train_batch_[SampleBatch.ACTIONS])
        adv = train_batch_[Postprocessing.ADVANTAGES]
        if sess:
            expected_logp = sess.run(expected_logp)
        elif fw == "torch":
            expected_logp = expected_logp.detach().cpu().numpy()
            adv = adv.detach().cpu().numpy()
        else:
            expected_logp = expected_logp.numpy()
        expected_loss = -np.mean(expected_logp * adv)
        check(results, expected_loss, decimals=4)
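# --- Illustrative sketch (not part of the original test suite) ---
# A quick, standalone way to verify the advantage numbers asserted in
# test_pg_loss_functions above: with gamma=0.99 and rewards [1.0, 1.0, 1.0],
# the discounted returns-to-go come out to [2.9701, 1.99, 1.0].
def _expected_returns_to_go(rewards, gamma=0.99):
    """Discounted returns-to-go: G_t = r_t + gamma * G_{t+1}."""
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))

# _expected_returns_to_go([1.0, 1.0, 1.0])  ->  [2.9701, 1.99, 1.0]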
def test_sac_loss_function(self):
    """Tests SAC loss function results across all frameworks."""
    config = sac.DEFAULT_CONFIG.copy()
    # Run locally.
    config["num_workers"] = 0
    config["learning_starts"] = 0
    config["twin_q"] = False
    config["gamma"] = 0.99
    # Switch on deterministic loss so we can compare the loss values.
    config["_deterministic_loss"] = True
    # Use very simple nets.
    config["Q_model"]["fcnet_hiddens"] = [10]
    config["policy_model"]["fcnet_hiddens"] = [10]
    # Make sure timing differences do not affect trainer.train().
    config["min_iter_time_s"] = 0
    # Test SAC with Simplex action space.
    config["env_config"] = {"simplex_actions": True}

    map_ = {
        # Normal net.
        "default_policy/sequential/action_1/kernel": "action_model.action_0._model.0.weight",
        "default_policy/sequential/action_1/bias": "action_model.action_0._model.0.bias",
        "default_policy/sequential/action_out/kernel": "action_model.action_out._model.0.weight",
        "default_policy/sequential/action_out/bias": "action_model.action_out._model.0.bias",
        "default_policy/sequential_1/q_hidden_0/kernel": "q_net.q_hidden_0._model.0.weight",
        "default_policy/sequential_1/q_hidden_0/bias": "q_net.q_hidden_0._model.0.bias",
        "default_policy/sequential_1/q_out/kernel": "q_net.q_out._model.0.weight",
        "default_policy/sequential_1/q_out/bias": "q_net.q_out._model.0.bias",
        "default_policy/value_out/kernel": "_value_branch._model.0.weight",
        "default_policy/value_out/bias": "_value_branch._model.0.bias",
        "default_policy/log_alpha": "log_alpha",
        # Target net.
        "default_policy/sequential_2/action_1/kernel": "action_model.action_0._model.0.weight",
        "default_policy/sequential_2/action_1/bias": "action_model.action_0._model.0.bias",
        "default_policy/sequential_2/action_out/kernel": "action_model.action_out._model.0.weight",
        "default_policy/sequential_2/action_out/bias": "action_model.action_out._model.0.bias",
        "default_policy/sequential_3/q_hidden_0/kernel": "q_net.q_hidden_0._model.0.weight",
        "default_policy/sequential_3/q_hidden_0/bias": "q_net.q_hidden_0._model.0.bias",
        "default_policy/sequential_3/q_out/kernel": "q_net.q_out._model.0.weight",
        "default_policy/sequential_3/q_out/bias": "q_net.q_out._model.0.bias",
        "default_policy/value_out_1/kernel": "_value_branch._model.0.weight",
        "default_policy/value_out_1/bias": "_value_branch._model.0.bias",
        "default_policy/log_alpha_1": "log_alpha",
    }

    env = SimpleEnv
    batch_size = 100
    if env is SimpleEnv:
        obs_size = (batch_size, 1)
        actions = np.random.random(size=(batch_size, 2))
    elif env == "CartPole-v0":
        obs_size = (batch_size, 4)
        actions = np.random.randint(0, 2, size=(batch_size, ))
    else:
        obs_size = (batch_size, 3)
        actions = np.random.random(size=(batch_size, 1))

    # Batch of size=n.
    input_ = self._get_batch_helper(obs_size, actions, batch_size)

    # Simply compare loss values AND grads of all frameworks with each
    # other.
    prev_fw_loss = weights_dict = None
    expect_c, expect_a, expect_e, expect_t = None, None, None, None
    # History of tf-updated NN-weights over n training steps.
    tf_updated_weights = []
    # History of input batches used.
    tf_inputs = []

    for fw, sess in framework_iterator(
            config, frameworks=("tf", "torch"), session=True):
        # Generate Trainer and get its default Policy object.
        trainer = sac.SACTrainer(config=config, env=env)
        policy = trainer.get_policy()
        p_sess = None
        if sess:
            p_sess = policy.get_session()

        # Set all weights (of all nets) to fixed values.
        if weights_dict is None:
            # Start with the tf vars-dict.
            assert fw in ["tf2", "tf", "tfe"]
            weights_dict = policy.get_weights()
            if fw == "tfe":
                log_alpha = weights_dict[10]
                weights_dict = self._translate_tfe_weights(
                    weights_dict, map_)
        else:
            assert fw == "torch"  # Then transfer that to torch Model.
            model_dict = self._translate_weights_to_torch(weights_dict, map_)
            policy.model.load_state_dict(model_dict)
            policy.target_model.load_state_dict(model_dict)

        if fw == "tf":
            log_alpha = weights_dict["default_policy/log_alpha"]
        elif fw == "torch":
            # Actually convert to torch tensors (by accessing everything).
            input_ = policy._lazy_tensor_dict(input_)
            input_ = {k: input_[k] for k in input_.keys()}
            log_alpha = policy.model.log_alpha.detach().cpu().numpy()[0]

        # Only run the expectation once, should be the same anyways
        # for all frameworks.
        if expect_c is None:
            expect_c, expect_a, expect_e, expect_t = self._sac_loss_helper(
                input_,
                weights_dict,
                sorted(weights_dict.keys()),
                log_alpha,
                fw,
                gamma=config["gamma"],
                sess=sess)

        # Get actual outs and compare to expectation AND previous
        # framework. c=critic, a=actor, e=entropy, t=td-error.
        if fw == "tf":
            c, a, e, t, tf_c_grads, tf_a_grads, tf_e_grads = p_sess.run(
                [
                    policy.critic_loss,
                    policy.actor_loss,
                    policy.alpha_loss,
                    policy.td_error,
                    policy.optimizer().compute_gradients(
                        policy.critic_loss[0], policy.model.q_variables()),
                    policy.optimizer().compute_gradients(
                        policy.actor_loss,
                        policy.model.policy_variables()),
                    policy.optimizer().compute_gradients(
                        policy.alpha_loss, policy.model.log_alpha),
                ],
                feed_dict=policy._get_loss_inputs_dict(input_, shuffle=False))
            tf_c_grads = [g for g, v in tf_c_grads]
            tf_a_grads = [g for g, v in tf_a_grads]
            tf_e_grads = [g for g, v in tf_e_grads]
        elif fw == "tfe":
            with tf.GradientTape() as tape:
                tf_loss(policy, policy.model, None, input_)
            c, a, e, t = (policy.critic_loss, policy.actor_loss,
                          policy.alpha_loss, policy.td_error)
            vars = tape.watched_variables()
            tf_c_grads = tape.gradient(c[0], vars[6:10])
            tf_a_grads = tape.gradient(a, vars[2:6])
            tf_e_grads = tape.gradient(e, vars[10])
        elif fw == "torch":
            loss_torch(policy, policy.model, None, input_)
            c, a, e, t = (policy.critic_loss, policy.actor_loss,
                          policy.alpha_loss, policy.td_error)

            # Test actor gradients.
            policy.actor_optim.zero_grad()
            assert all(v.grad is None for v in policy.model.q_variables())
            assert all(
                v.grad is None for v in policy.model.policy_variables())
            assert policy.model.log_alpha.grad is None
            a.backward()
            # `actor_loss` depends on Q-net vars (but these grads must
            # be ignored and overridden in critic_loss.backward!).
            assert not any(
                v.grad is None for v in policy.model.q_variables())
            assert not all(
                torch.mean(v.grad) == 0
                for v in policy.model.policy_variables())
            assert not all(
                torch.min(v.grad) == 0
                for v in policy.model.policy_variables())
            assert policy.model.log_alpha.grad is None
            # Compare with tf ones.
            torch_a_grads = [
                v.grad for v in policy.model.policy_variables()
            ]
            for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g.detach().cpu()))
                else:
                    check(tf_g, torch_g)

            # Test critic gradients.
            policy.critic_optims[0].zero_grad()
            assert all(
                torch.mean(v.grad) == 0.0
                for v in policy.model.q_variables())
            assert all(
                torch.min(v.grad) == 0.0
                for v in policy.model.q_variables())
            assert policy.model.log_alpha.grad is None
            c[0].backward()
            assert not all(
                torch.mean(v.grad) == 0
                for v in policy.model.q_variables())
            assert not all(
                torch.min(v.grad) == 0
                for v in policy.model.q_variables())
            assert policy.model.log_alpha.grad is None
            # Compare with tf ones.
            torch_c_grads = [v.grad for v in policy.model.q_variables()]
            for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g.detach().cpu()))
                else:
                    check(tf_g, torch_g)
            # Compare (unchanged(!) actor grads) with tf ones.
            torch_a_grads = [
                v.grad for v in policy.model.policy_variables()
            ]
            for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g.detach().cpu()))
                else:
                    check(tf_g, torch_g)

            # Test alpha gradient.
            policy.alpha_optim.zero_grad()
            assert policy.model.log_alpha.grad is None
            e.backward()
            assert policy.model.log_alpha.grad is not None
            check(policy.model.log_alpha.grad, tf_e_grads)

        check(c, expect_c)
        check(a, expect_a)
        check(e, expect_e)
        check(t, expect_t)

        # Store this framework's losses in prev_fw_loss to compare with
        # next framework's outputs.
        if prev_fw_loss is not None:
            check(c, prev_fw_loss[0])
            check(a, prev_fw_loss[1])
            check(e, prev_fw_loss[2])
            check(t, prev_fw_loss[3])
        prev_fw_loss = (c, a, e, t)

        # Update weights from our batch (n times).
        for update_iteration in range(10):
            print("train iteration {}".format(update_iteration))
            if fw == "tf":
                in_ = self._get_batch_helper(obs_size, actions, batch_size)
                tf_inputs.append(in_)
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                buf = LocalReplayBuffer.get_instance_for_testing()
                buf._fake_batch = in_
                trainer.train()
                updated_weights = policy.get_weights()
                # Net must have changed.
                if tf_updated_weights:
                    check(
                        updated_weights[
                            "default_policy/sequential/action_1/kernel"],
                        tf_updated_weights[-1][
                            "default_policy/sequential/action_1/kernel"],
                        false=True)
                tf_updated_weights.append(updated_weights)
            # Compare with updated tf-weights. Must all be the same.
            else:
                tf_weights = tf_updated_weights[update_iteration]
                in_ = tf_inputs[update_iteration]
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                buf = LocalReplayBuffer.get_instance_for_testing()
                buf._fake_batch = in_
                trainer.train()
                # Compare updated model.
                for tf_key in sorted(tf_weights.keys())[2:10]:
                    tf_var = tf_weights[tf_key]
                    torch_var = policy.model.state_dict()[map_[tf_key]]
                    if tf_var.shape != torch_var.shape:
                        check(
                            tf_var,
                            np.transpose(torch_var.detach().cpu()),
                            rtol=0.05)
                    else:
                        check(tf_var, torch_var, rtol=0.05)
                # And alpha.
                check(policy.model.log_alpha,
                      tf_weights["default_policy/log_alpha"])
                # Compare target nets.
                for tf_key in sorted(tf_weights.keys())[10:18]:
                    tf_var = tf_weights[tf_key]
                    torch_var = policy.target_model.state_dict()[
                        map_[tf_key]]
                    if tf_var.shape != torch_var.shape:
                        check(
                            tf_var,
                            np.transpose(torch_var.detach().cpu()),
                            rtol=0.05)
                    else:
                        check(tf_var, torch_var, rtol=0.05)
def test_diag_gaussian(self):
    """Tests the DiagGaussian ActionDistribution for all frameworks."""
    input_space = Box(-2.0, 1.0, shape=(2000, 10))

    for fw, sess in framework_iterator(
            frameworks=("torch", "tf", "tfe"), session=True):
        cls = DiagGaussian if fw != "torch" else TorchDiagGaussian

        # Do a stability test using extreme NN outputs to see whether
        # sampling and logp'ing result in NaN or +/-inf values.
        self._stability_test(cls, input_space.shape, fw=fw, sess=sess)

        # Batch of size=n and deterministic.
        inputs = input_space.sample()
        means, _ = np.split(inputs, 2, axis=-1)
        diag_distribution = cls(inputs, {})
        expected = means
        # Sample n times, expect always mean value (deterministic draw).
        out = diag_distribution.deterministic_sample()
        check(out, expected)

        # Batch of size=n and non-deterministic -> expect roughly the mean.
        inputs = input_space.sample()
        means, log_stds = np.split(inputs, 2, axis=-1)
        diag_distribution = cls(inputs, {})
        expected = means
        values = diag_distribution.sample()
        if sess:
            values = sess.run(values)
        else:
            values = values.numpy()
        check(np.mean(values), expected.mean(), decimals=1)

        # Test log-likelihood outputs.
        sampled_action_logp = diag_distribution.logp(
            values if fw != "torch" else torch.Tensor(values))
        if sess:
            sampled_action_logp = sess.run(sampled_action_logp)
        else:
            sampled_action_logp = sampled_action_logp.numpy()

        # NN output.
        means = np.array(
            [[0.1, 0.2, 0.3, 0.4, 50.0], [-0.1, -0.2, -0.3, -0.4, -1.0]],
            dtype=np.float32)
        log_stds = np.array(
            [[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]],
            dtype=np.float32)
        diag_distribution = cls(
            inputs=np.concatenate([means, log_stds], axis=-1), model={})
        # Convert to parameters for distr.
        stds = np.exp(log_stds)
        # Values to get log-likelihoods for.
        values = np.array(
            [[0.9, 0.2, 0.4, -0.1, -1.05], [-0.9, -0.2, 0.4, -0.1, -1.05]])
        # Get log-llh from regular gaussian.
        log_prob = np.sum(np.log(norm.pdf(values, means, stds)), -1)

        outs = diag_distribution.logp(
            values if fw != "torch" else torch.Tensor(values))
        if sess:
            outs = sess.run(outs)
        check(outs, log_prob, decimals=4)
def test_multi_action_distribution(self):
    """Tests the MultiActionDistribution (across all frameworks)."""
    batch_size = 1000
    input_space = Tuple([
        Box(-10.0, 10.0, shape=(batch_size, 4)),
        Box(-2.0, 2.0, shape=(batch_size, 6)),
        Dict({"a": Box(-1.0, 1.0, shape=(batch_size, 4))}),
    ])
    std_space = Box(-0.05, 0.05, shape=(batch_size, 3))

    low, high = -1.0, 1.0
    value_space = Tuple([
        Box(0, 3, shape=(batch_size, ), dtype=np.int32),
        Box(-2.0, 2.0, shape=(batch_size, 3), dtype=np.float32),
        Dict({"a": Box(0.0, 1.0, shape=(batch_size, 2), dtype=np.float32)}),
    ])

    for fw, sess in framework_iterator(session=True):
        if fw == "torch":
            cls = TorchMultiActionDistribution
            child_distr_cls = [
                TorchCategorical,
                TorchDiagGaussian,
                partial(TorchBeta, low=low, high=high),
            ]
        else:
            cls = MultiActionDistribution
            child_distr_cls = [
                Categorical,
                DiagGaussian,
                partial(Beta, low=low, high=high),
            ]

        inputs = list(input_space.sample())
        distr = cls(
            np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1),
            model={},
            action_space=value_space,
            child_distributions=child_distr_cls,
            input_lens=[4, 6, 4])

        # Adjust inputs for the Beta distr just as Beta itself does.
        inputs[2]["a"] = np.clip(
            inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER))
        inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
        # Sample deterministically.
        expected_det = [
            np.argmax(inputs[0], axis=-1),
            inputs[1][:, :3],  # [:3]=Mean values.
            # Mean for a Beta distribution:
            # 1 / [1 + (beta/alpha)] * range + low
            (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, 0:2])) *
            (high - low) + low,
        ]
        out = distr.deterministic_sample()
        if sess:
            out = sess.run(out)
        check(out[0], expected_det[0])
        check(out[1], expected_det[1])
        check(out[2]["a"], expected_det[2])

        # Stochastic sampling -> expect roughly the mean.
        inputs = list(input_space.sample())
        # Fix categorical inputs (not needed for distribution itself, but
        # for our expectation calculations).
        inputs[0] = softmax(inputs[0], -1)
        # Fix std inputs (shouldn't be too large for this test).
        inputs[1][:, 3:] = std_space.sample()
        # Adjust inputs for the Beta distr just as Beta itself does.
        inputs[2]["a"] = np.clip(
            inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER))
        inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
        distr = cls(
            np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1),
            model={},
            action_space=value_space,
            child_distributions=child_distr_cls,
            input_lens=[4, 6, 4])
        expected_mean = [
            np.mean(np.sum(inputs[0] * np.array([0, 1, 2, 3]), -1)),
            inputs[1][:, :3],  # [:3]=Mean values.
            # Mean for a Beta distribution:
            # 1 / [1 + (beta/alpha)] * range + low
            (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, :2])) *
            (high - low) + low,
        ]
        out = distr.sample()
        if sess:
            out = sess.run(out)
        out = list(out)
        if fw == "torch":
            out[0] = out[0].numpy()
            out[1] = out[1].numpy()
            out[2]["a"] = out[2]["a"].numpy()
        check(np.mean(out[0]), expected_mean[0], decimals=1)
        check(np.mean(out[1], 0), np.mean(expected_mean[1], 0), decimals=1)
        check(
            np.mean(out[2]["a"], 0), np.mean(expected_mean[2], 0), decimals=1)

        # Test log-likelihood outputs.
        # Make sure beta-values are within 0.0 and 1.0 for the numpy
        # calculation (which doesn't have scaling).
        inputs = list(input_space.sample())
        # Adjust inputs for the Beta distr just as Beta itself does.
        inputs[2]["a"] = np.clip(
            inputs[2]["a"], np.log(SMALL_NUMBER), -np.log(SMALL_NUMBER))
        inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
        distr = cls(
            np.concatenate([inputs[0], inputs[1], inputs[2]["a"]], axis=1),
            model={},
            action_space=value_space,
            child_distributions=child_distr_cls,
            input_lens=[4, 6, 4])
        inputs[0] = softmax(inputs[0], -1)
        values = list(value_space.sample())
        log_prob_beta = np.log(
            beta.pdf(values[2]["a"], inputs[2]["a"][:, :2],
                     inputs[2]["a"][:, 2:]))
        # Now do the up-scaling for [2] (beta values) to be between
        # low/high.
        values[2]["a"] = values[2]["a"] * (high - low) + low
        inputs[1][:, 3:] = np.exp(inputs[1][:, 3:])
        expected_log_llh = np.sum(
            np.concatenate([
                np.expand_dims(
                    np.log(
                        [i[values[0][j]] for j, i in enumerate(inputs[0])]),
                    -1),
                np.log(
                    norm.pdf(values[1], inputs[1][:, :3], inputs[1][:, 3:])),
                log_prob_beta,
            ], -1), -1)

        values[0] = np.expand_dims(values[0], -1)
        if fw == "torch":
            values = tree.map_structure(lambda s: torch.Tensor(s), values)
        # Test all flattened input.
        concat = np.concatenate(tree.flatten(values), -1).astype(np.float32)
        out = distr.logp(concat)
        if sess:
            out = sess.run(out)
        check(out, expected_log_llh, atol=15)
        # Test structured input.
        out = distr.logp(values)
        if sess:
            out = sess.run(out)
        check(out, expected_log_llh, atol=15)
        # Test flattened input.
        out = distr.logp(tree.flatten(values))
        if sess:
            out = sess.run(out)
        check(out, expected_log_llh, atol=15)
def test_multi_categorical(self):
    batch_size = 100
    num_categories = 3
    num_sub_distributions = 5
    # Create 5 categorical distributions of 3 categories each.
    inputs_space = Box(
        -1.0,
        2.0,
        shape=(batch_size, num_sub_distributions * num_categories))
    values_space = Box(
        0,
        num_categories - 1,
        shape=(num_sub_distributions, batch_size),
        dtype=np.int32)

    inputs = inputs_space.sample()
    input_lengths = [num_categories] * num_sub_distributions
    inputs_split = np.split(inputs, num_sub_distributions, axis=1)

    for fw, sess in framework_iterator(session=True):
        # Create the correct distribution object.
        cls = MultiCategorical if fw != "torch" else TorchMultiCategorical
        multi_categorical = cls(inputs, None, input_lengths)

        # Do a stability test using extreme NN outputs to see whether
        # sampling and logp'ing result in NaN or +/-inf values.
        self._stability_test(
            cls,
            inputs_space.shape,
            fw=fw,
            sess=sess,
            bounds=(0, num_categories - 1),
            extra_kwargs={"input_lens": input_lengths})

        # Batch of size=100 and deterministic (True).
        expected = np.transpose(np.argmax(inputs_split, axis=-1))
        # Sample, expect always max value
        # (max likelihood for deterministic draw).
        out = multi_categorical.deterministic_sample()
        check(out, expected)

        # Batch of size=100 and non-deterministic -> expect roughly the
        # mean.
        out = multi_categorical.sample()
        check(
            tf.reduce_mean(out)
            if fw != "torch" else torch.mean(out.float()),
            1.0,
            decimals=0)

        # Test log-likelihood outputs.
        probs = softmax(inputs_split)
        values = values_space.sample()

        out = multi_categorical.logp(values if fw != "torch" else [
            torch.Tensor(values[i]) for i in range(num_sub_distributions)
        ])
        expected = []
        for i in range(batch_size):
            expected.append(
                np.sum(
                    np.log(
                        np.array([
                            probs[j][i][values[j][i]]
                            for j in range(num_sub_distributions)
                        ]))))
        check(out, expected, decimals=4)

        # Test entropy outputs.
        out = multi_categorical.entropy()
        expected_entropy = -np.sum(np.sum(probs * np.log(probs), 0), -1)
        check(out, expected_entropy)
def test_squashed_gaussian(self):
    """Tests the SquashedGaussian ActionDistribution for all frameworks."""
    input_space = Box(-2.0, 2.0, shape=(2000, 10))
    low, high = -2.0, 1.0

    for fw, sess in framework_iterator(
            frameworks=("torch", "tf", "tfe"), session=True):
        cls = SquashedGaussian if fw != "torch" else TorchSquashedGaussian

        # Do a stability test using extreme NN outputs to see whether
        # sampling and logp'ing result in NaN or +/-inf values.
        self._stability_test(
            cls, input_space.shape, fw=fw, sess=sess, bounds=(low, high))

        # Batch of size=n and deterministic.
        inputs = input_space.sample()
        means, _ = np.split(inputs, 2, axis=-1)
        squashed_distribution = cls(inputs, {}, low=low, high=high)
        expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low
        # Sample n times, expect always mean value (deterministic draw).
        out = squashed_distribution.deterministic_sample()
        check(out, expected)

        # Batch of size=n and non-deterministic -> expect roughly the mean.
        inputs = input_space.sample()
        means, log_stds = np.split(inputs, 2, axis=-1)
        squashed_distribution = cls(inputs, {}, low=low, high=high)
        expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low
        values = squashed_distribution.sample()
        if sess:
            values = sess.run(values)
        else:
            values = values.numpy()
        self.assertTrue(np.max(values) <= high)
        self.assertTrue(np.min(values) >= low)
        check(np.mean(values), expected.mean(), decimals=1)

        # Test log-likelihood outputs.
        sampled_action_logp = squashed_distribution.logp(
            values if fw != "torch" else torch.Tensor(values))
        if sess:
            sampled_action_logp = sess.run(sampled_action_logp)
        else:
            sampled_action_logp = sampled_action_logp.numpy()
        # Convert to parameters for distr.
        stds = np.exp(np.clip(log_stds, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT))
        # Unsquash values, then get log-llh from regular gaussian.
        normed_values = (values - low) / (high - low) * 2.0 - 1.0
        safe_normed_values = np.clip(
            normed_values, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER)
        unsquashed_values = np.arctanh(safe_normed_values)
        log_prob_unsquashed = np.sum(
            np.log(norm.pdf(unsquashed_values, means, stds)), -1)
        log_prob = log_prob_unsquashed - np.sum(
            np.log(1 - np.tanh(unsquashed_values)**2), axis=-1)
        check(np.sum(sampled_action_logp), np.sum(log_prob), rtol=0.05)

        # NN output.
        means = np.array(
            [[0.1, 0.2, 0.3, 0.4, 50.0], [-0.1, -0.2, -0.3, -0.4, -1.0]])
        log_stds = np.array(
            [[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]])
        squashed_distribution = cls(
            inputs=np.concatenate([means, log_stds], axis=-1),
            model={},
            low=low,
            high=high)
        # Convert to parameters for distr.
        stds = np.exp(log_stds)
        # Values to get log-likelihoods for.
        values = np.array(
            [[0.9, 0.2, 0.4, -0.1, -1.05], [-0.9, -0.2, 0.4, -0.1, -1.05]])
        # Unsquash values, then get log-llh from regular gaussian.
        unsquashed_values = np.arctanh(
            (values - low) / (high - low) * 2.0 - 1.0)
        log_prob_unsquashed = np.sum(
            np.log(norm.pdf(unsquashed_values, means, stds)), -1)
        log_prob = log_prob_unsquashed - np.sum(
            np.log(1 - np.tanh(unsquashed_values)**2), axis=-1)

        outs = squashed_distribution.logp(
            values if fw != "torch" else torch.Tensor(values))
        if sess:
            outs = sess.run(outs)
        check(outs, log_prob, decimals=4)
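# --- Illustrative sketch (not part of the original test suite) ---
# The tanh change-of-variables correction mirrored from
# test_squashed_gaussian above: for a = tanh(u) with u ~ N(means, stds),
# log p(a) = log p_N(u) - sum_i log(1 - tanh(u_i)^2). As in the test's own
# expectation math, the constant Jacobian term for rescaling into
# [low, high] is omitted.
import numpy as np
from scipy.stats import norm

def _tanh_gaussian_logp(values, means, log_stds, low=-2.0, high=1.0):
    stds = np.exp(log_stds)
    # Map values from [low, high] back to (-1, 1), then unsquash.
    u = np.arctanh((values - low) / (high - low) * 2.0 - 1.0)
    logp_unsquashed = np.sum(np.log(norm.pdf(u, means, stds)), -1)
    return logp_unsquashed - np.sum(np.log(1.0 - np.tanh(u) ** 2), -1)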
def test_a2c_exec_impl(ray_start_regular):
    config = {"min_iter_time_s": 0}
    for _ in framework_iterator(config, ("tf", "torch")):
        trainer = a3c.A2CTrainer(env="CartPole-v0", config=config)
        assert isinstance(trainer.train(), dict)
        check_compute_single_action(trainer)
def test_sac_compilation(self):
    """Tests whether an SACTrainer can be built with all frameworks."""
    config = sac.DEFAULT_CONFIG.copy()
    config["Q_model"] = sac.DEFAULT_CONFIG["Q_model"].copy()
    config["num_workers"] = 0  # Run locally.
    config["n_step"] = 3
    config["twin_q"] = True
    config["learning_starts"] = 0
    config["prioritized_replay"] = True
    config["rollout_fragment_length"] = 10
    config["train_batch_size"] = 10
    # If we use the default buffer size (1e6), the buffer will take up
    # 169.445 GB memory, which is beyond travis-ci's current (Mar 19, 2021)
    # available system memory (8.34816 GB).
    config["replay_buffer_config"]["capacity"] = 40000
    # Test with saved replay buffer.
    config["store_buffer_in_checkpoints"] = True
    num_iterations = 1

    ModelCatalog.register_custom_model("batch_norm", KerasBatchNormModel)
    ModelCatalog.register_custom_model("batch_norm_torch", TorchBatchNormModel)

    image_space = Box(-1.0, 1.0, shape=(84, 84, 3))
    simple_space = Box(-1.0, 1.0, shape=(3,))

    tune.register_env(
        "random_dict_env",
        lambda _: RandomEnv(
            {
                "observation_space": Dict(
                    {
                        "a": simple_space,
                        "b": Discrete(2),
                        "c": image_space,
                    }
                ),
                "action_space": Box(-1.0, 1.0, shape=(1,)),
            }
        ),
    )
    tune.register_env(
        "random_tuple_env",
        lambda _: RandomEnv(
            {
                "observation_space": Tuple(
                    [simple_space, Discrete(2), image_space]
                ),
                "action_space": Box(-1.0, 1.0, shape=(1,)),
            }
        ),
    )

    for fw in framework_iterator(config, with_eager_tracing=True):
        # Test for different env types (discrete w/ and w/o image, + cont).
        for env in [
            "random_dict_env",
            "random_tuple_env",
            # "MsPacmanNoFrameskip-v4",
            "CartPole-v0",
        ]:
            print("Env={}".format(env))
            # Test making the Q-model a custom one for CartPole, otherwise,
            # use the default model.
            config["Q_model"]["custom_model"] = (
                "batch_norm{}".format("_torch" if fw == "torch" else "")
                if env == "CartPole-v0"
                else None
            )
            trainer = sac.SACTrainer(config=config, env=env)
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)
            check_compute_single_action(trainer)

            # Test whether the replay buffer is saved along with
            # a checkpoint (no point in doing it for all frameworks since
            # this is framework agnostic).
            if fw == "tf" and env == "CartPole-v0":
                checkpoint = trainer.save()
                new_trainer = sac.SACTrainer(config, env=env)
                new_trainer.restore(checkpoint)
                # Get some data from the buffer and compare.
                data = trainer.local_replay_buffer.replay_buffers[
                    "default_policy"
                ]._storage[: 42 + 42]
                new_data = new_trainer.local_replay_buffer.replay_buffers[
                    "default_policy"
                ]._storage[: 42 + 42]
                check(data, new_data)
                new_trainer.stop()

            trainer.stop()
def test_marwil_loss_function(self):
    """
    To generate the historic data used in this test case, first run:
    $ ./train.py --run=PPO --env=CartPole-v0 \
      --stop='{"timesteps_total": 50000}' \
      --config='{"output": "/tmp/out", "batch_mode": "complete_episodes"}'
    """
    rllib_dir = Path(__file__).parent.parent.parent.parent
    print("rllib dir={}".format(rllib_dir))
    data_file = os.path.join(rllib_dir, "tests/data/cartpole/small.json")
    print("data_file={} exists={}".format(data_file, os.path.isfile(data_file)))

    config = marwil.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    # Learn from offline data.
    config["input"] = [data_file]

    for fw, sess in framework_iterator(config, session=True):
        reader = JsonReader(inputs=[data_file])
        batch = reader.next()

        trainer = marwil.MARWILTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        model = policy.model

        # Calculate our own expected values (to then compare against the
        # agent's loss output).
        cumulative_rewards = compute_advantages(
            batch, 0.0, config["gamma"], 1.0, False, False
        )["advantages"]
        if fw == "torch":
            cumulative_rewards = torch.tensor(cumulative_rewards)
        if fw != "tf":
            batch = policy._lazy_tensor_dict(batch)
        model_out, _ = model(batch)
        vf_estimates = model.value_function()
        if fw == "tf":
            model_out, vf_estimates = policy.get_session().run(
                [model_out, vf_estimates]
            )
        adv = cumulative_rewards - vf_estimates
        if fw == "torch":
            adv = adv.detach().cpu().numpy()
        adv_squared = np.mean(np.square(adv))
        c_2 = 100.0 + 1e-8 * (adv_squared - 100.0)
        c = np.sqrt(c_2)
        exp_advs = np.exp(config["beta"] * (adv / c))
        dist = policy.dist_class(model_out, model)
        logp = dist.logp(batch["actions"])
        if fw == "torch":
            logp = logp.detach().cpu().numpy()
        elif fw == "tf":
            logp = sess.run(logp)
        # Calculate all expected loss components.
        expected_vf_loss = 0.5 * adv_squared
        expected_pol_loss = -1.0 * np.mean(exp_advs * logp)
        expected_loss = expected_pol_loss + config["vf_coeff"] * expected_vf_loss

        # Calculate the algorithm's loss (to check against our own
        # calculation above).
        batch.set_get_interceptor(None)
        postprocessed_batch = policy.postprocess_trajectory(batch)
        loss_func = (
            marwil.marwil_tf_policy.marwil_loss
            if fw != "torch"
            else marwil.marwil_torch_policy.marwil_loss
        )
        if fw != "tf":
            policy._lazy_tensor_dict(postprocessed_batch)
            loss_out = loss_func(
                policy, model, policy.dist_class, postprocessed_batch
            )
        else:
            loss_out, v_loss, p_loss = policy.get_session().run(
                [policy._loss, policy.loss.v_loss, policy.loss.p_loss],
                feed_dict=policy._get_loss_inputs_dict(
                    postprocessed_batch, shuffle=False
                ),
            )

        # Check all components.
        if fw == "torch":
            check(policy.v_loss, expected_vf_loss, decimals=4)
            check(policy.p_loss, expected_pol_loss, decimals=4)
        elif fw == "tf":
            check(v_loss, expected_vf_loss, decimals=4)
            check(p_loss, expected_pol_loss, decimals=4)
        else:
            check(policy.loss.v_loss, expected_vf_loss, decimals=4)
            check(policy.loss.p_loss, expected_pol_loss, decimals=4)
        check(loss_out, expected_loss, decimals=3)
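# --- Illustrative sketch (not part of the original test suite) ---
# The MARWIL loss composition checked in test_marwil_loss_function above,
# as plain numpy, assuming advantages `adv` and action log-probs `logp`
# are given as arrays; `beta` and `vf_coeff` mirror the config keys used
# in the test.
import numpy as np

def _marwil_loss(adv, logp, beta, vf_coeff, c_2_init=100.0):
    adv_squared = np.mean(np.square(adv))
    # Single update step of the moving average of squared advantages.
    c_2 = c_2_init + 1e-8 * (adv_squared - c_2_init)
    exp_advs = np.exp(beta * (adv / np.sqrt(c_2)))
    vf_loss = 0.5 * adv_squared
    pol_loss = -np.mean(exp_advs * logp)
    return pol_loss + vf_coeff * vf_loss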
def evaluate_test(algo, env="CartPole-v0", test_episode_rollout=False):
    extra_config = ""
    if algo == "ARS":
        extra_config = ',"train_batch_size": 10, "noise_size": 250000'
    elif algo == "ES":
        extra_config = (
            ',"episodes_per_batch": 1,"train_batch_size": 10, '
            '"noise_size": 250000'
        )

    for fw in framework_iterator(frameworks=("tf", "torch")):
        fw_ = ', "framework": "{}"'.format(fw)

        tmp_dir = os.popen("mktemp -d").read()[:-1]
        if not os.path.exists(tmp_dir):
            sys.exit(1)

        print("Saving results to {}".format(tmp_dir))

        rllib_dir = str(Path(__file__).parent.parent.absolute())
        print("RLlib dir = {}\nexists={}".format(
            rllib_dir, os.path.exists(rllib_dir)))
        os.system(
            "python {}/train.py --local-dir={} --run={} "
            "--checkpoint-freq=1 ".format(rllib_dir, tmp_dir, algo)
            + "--config='{"
            + '"num_workers": 1, "num_gpus": 0{}{}'.format(fw_, extra_config)
            + ', "min_sample_timesteps_per_reporting": 5,'
            '"min_time_s_per_reporting": 0.1, '
            '"model": {"fcnet_hiddens": [10]}'
            "}' --stop='{\"training_iteration\": 1}'"
            + " --env={}".format(env)
        )

        checkpoint_path = os.popen(
            "ls {}/default/*/checkpoint_000001/checkpoint-1".format(tmp_dir)
        ).read()[:-1]
        if not os.path.exists(checkpoint_path):
            sys.exit(1)
        print("Checkpoint path {} (exists)".format(checkpoint_path))

        # Test rolling out n steps.
        os.popen(
            'python {}/evaluate.py --run={} "{}" --steps=10 '
            '--out="{}/rollouts_10steps.pkl" --no-render'.format(
                rllib_dir, algo, checkpoint_path, tmp_dir
            )
        ).read()
        if not os.path.exists(tmp_dir + "/rollouts_10steps.pkl"):
            sys.exit(1)
        print("evaluate output (10 steps) exists!")

        # Test rolling out 1 episode.
        if test_episode_rollout:
            os.popen(
                'python {}/evaluate.py --run={} "{}" --episodes=1 '
                '--out="{}/rollouts_1episode.pkl" --no-render'.format(
                    rllib_dir, algo, checkpoint_path, tmp_dir
                )
            ).read()
            if not os.path.exists(tmp_dir + "/rollouts_1episode.pkl"):
                sys.exit(1)
            print("evaluate output (1 ep) exists!")

        # Cleanup.
        os.popen('rm -rf "{}"'.format(tmp_dir)).read()
def test_dummy_components(self):
    # Bazel makes it hard to find files specified in `args`
    # (and `data`). Use the true absolute path.
    script_dir = Path(__file__).parent
    abs_path = script_dir.absolute()

    for fw, sess in framework_iterator(session=True):
        fw_ = fw if fw != "tfe" else "tf"

        # Try to create from an abstract class w/o default constructor.
        # Expect None.
        test = from_config({
            "type": AbstractDummyComponent,
            "framework": fw_
        })
        check(test, None)

        # Create a Component via python API (config dict).
        component = from_config(
            dict(
                type=DummyComponent,
                prop_a=1.0,
                prop_d="non_default",
                framework=fw_))
        check(component.prop_d, "non_default")

        # Create a tf Component from json file.
        config_file = str(abs_path.joinpath("dummy_config.json"))
        component = from_config(config_file, framework=fw_)
        check(component.prop_c, "default")
        check(component.prop_d, 4)  # default
        value = component.add(3.3)
        if sess:
            value = sess.run(value)
        check(value, 5.3)  # prop_b == 2.0

        # Create a torch Component from yaml file.
        config_file = str(abs_path.joinpath("dummy_config.yml"))
        component = from_config(config_file, framework=fw_)
        check(component.prop_a, "something else")
        check(component.prop_d, 3)
        value = component.add(1.2)
        if sess:
            value = sess.run(value)
        check(value, np.array([2.2]))  # prop_b == 1.0

        # Create tf Component from json-string (e.g. on command line).
        component = from_config(
            '{"type": "ray.rllib.utils.tests.'
            'test_framework_agnostic_components.DummyComponent", '
            '"prop_a": "A", "prop_b": -1.0, "prop_c": "non-default", '
            '"framework": "' + fw_ + '"}')
        check(component.prop_a, "A")
        check(component.prop_d, 4)  # default
        value = component.add(-1.1)
        if sess:
            value = sess.run(value)
        check(value, -2.1)  # prop_b == -1.0

        # Test recognizing default module path.
        component = from_config(
            DummyComponent,
            '{"type": "NonAbstractChildOfDummyComponent", '
            '"prop_a": "A", "prop_b": -1.0, "prop_c": "non-default",'
            '"framework": "' + fw_ + '"}')
        check(component.prop_a, "A")
        check(component.prop_d, 4)  # default
        value = component.add(-1.1)
        if sess:
            value = sess.run(value)
        check(value, -2.1)  # prop_b == -1.0

        # Test recognizing default package path.
        scope = None
        if sess:
            scope = tf.variable_scope("exploration_object")
            scope.__enter__()
        component = from_config(
            Exploration, {
                "type": "EpsilonGreedy",
                "action_space": Discrete(2),
                "framework": fw_,
                "num_workers": 0,
                "worker_index": 0,
                "policy_config": {},
                "model": None
            })
        if scope:
            scope.__exit__(None, None, None)
        check(component.epsilon_schedule.outside_value, 0.05)  # default

        # Create torch Component from yaml-string.
        component = from_config(
            "type: ray.rllib.utils.tests."
            "test_framework_agnostic_components.DummyComponent\n"
            "prop_a: B\nprop_b: -1.5\nprop_c: non-default\nframework: "
            "{}".format(fw_))
        check(component.prop_a, "B")
        check(component.prop_d, 4)  # default
        value = component.add(-5.1)
        if sess:
            value = sess.run(value)
        check(value, np.array([-6.6]))  # prop_b == -1.5
def test_ddpg_loss_function(self):
    """Tests DDPG loss function results across all frameworks."""
    config = ddpg.DEFAULT_CONFIG.copy()
    # Run locally.
    config["seed"] = 42
    config["num_workers"] = 0
    config["learning_starts"] = 0
    config["twin_q"] = True
    config["use_huber"] = True
    config["huber_threshold"] = 1.0
    config["gamma"] = 0.99
    # Make this small (seems to introduce errors).
    config["l2_reg"] = 1e-10
    config["prioritized_replay"] = False
    # Use very simple nets.
    config["actor_hiddens"] = [10]
    config["critic_hiddens"] = [10]
    # Make sure timing differences do not affect trainer.train().
    config["min_time_s_per_reporting"] = 0
    config["timesteps_per_iteration"] = 100

    map_ = {
        # Normal net.
        "default_policy/actor_hidden_0/kernel": "policy_model.action_0._model.0.weight",
        "default_policy/actor_hidden_0/bias": "policy_model.action_0._model.0.bias",
        "default_policy/actor_out/kernel": "policy_model.action_out._model.0.weight",
        "default_policy/actor_out/bias": "policy_model.action_out._model.0.bias",
        "default_policy/sequential/q_hidden_0/kernel": "q_model.q_hidden_0._model.0.weight",
        "default_policy/sequential/q_hidden_0/bias": "q_model.q_hidden_0._model.0.bias",
        "default_policy/sequential/q_out/kernel": "q_model.q_out._model.0.weight",
        "default_policy/sequential/q_out/bias": "q_model.q_out._model.0.bias",
        # -- twin.
        "default_policy/sequential_1/twin_q_hidden_0/kernel": "twin_q_model.twin_q_hidden_0._model.0.weight",
        "default_policy/sequential_1/twin_q_hidden_0/bias": "twin_q_model.twin_q_hidden_0._model.0.bias",
        "default_policy/sequential_1/twin_q_out/kernel": "twin_q_model.twin_q_out._model.0.weight",
        "default_policy/sequential_1/twin_q_out/bias": "twin_q_model.twin_q_out._model.0.bias",
        # Target net.
        "default_policy/actor_hidden_0_1/kernel": "policy_model.action_0._model.0.weight",
        "default_policy/actor_hidden_0_1/bias": "policy_model.action_0._model.0.bias",
        "default_policy/actor_out_1/kernel": "policy_model.action_out._model.0.weight",
        "default_policy/actor_out_1/bias": "policy_model.action_out._model.0.bias",
        "default_policy/sequential_2/q_hidden_0/kernel": "q_model.q_hidden_0._model.0.weight",
        "default_policy/sequential_2/q_hidden_0/bias": "q_model.q_hidden_0._model.0.bias",
        "default_policy/sequential_2/q_out/kernel": "q_model.q_out._model.0.weight",
        "default_policy/sequential_2/q_out/bias": "q_model.q_out._model.0.bias",
        # -- twin.
        "default_policy/sequential_3/twin_q_hidden_0/kernel": "twin_q_model.twin_q_hidden_0._model.0.weight",
        "default_policy/sequential_3/twin_q_hidden_0/bias": "twin_q_model.twin_q_hidden_0._model.0.bias",
        "default_policy/sequential_3/twin_q_out/kernel": "twin_q_model.twin_q_out._model.0.weight",
        "default_policy/sequential_3/twin_q_out/bias": "twin_q_model.twin_q_out._model.0.bias",
    }

    env = SimpleEnv
    batch_size = 100
    obs_size = (batch_size, 1)
    actions = np.random.random(size=(batch_size, 1))

    # Batch of size=n.
    input_ = self._get_batch_helper(obs_size, actions, batch_size)

    # Simply compare loss values AND grads of all frameworks with each
    # other.
    prev_fw_loss = weights_dict = None
    expect_c, expect_a, expect_t = None, None, None
    # History of tf-updated NN-weights over n training steps.
    tf_updated_weights = []
    # History of input batches used.
    tf_inputs = []

    for fw, sess in framework_iterator(
            config, frameworks=("tf", "torch"), session=True):
        # Generate Trainer and get its default Policy object.
        trainer = ddpg.DDPGTrainer(config=config, env=env)
        policy = trainer.get_policy()
        p_sess = None
        if sess:
            p_sess = policy.get_session()

        # Set all weights (of all nets) to fixed values.
        if weights_dict is None:
            assert fw == "tf"  # Start with the tf vars-dict.
            weights_dict = policy.get_weights()
        else:
            assert fw == "torch"  # Then transfer that to torch Model.
            model_dict = self._translate_weights_to_torch(weights_dict, map_)
            policy.model.load_state_dict(model_dict)
            policy.target_model.load_state_dict(model_dict)

        if fw == "torch":
            # Actually convert to torch tensors.
            input_ = policy._lazy_tensor_dict(input_)
            input_ = {k: input_[k] for k in input_.keys()}

        # Only run the expectation once, should be the same anyways
        # for all frameworks.
        if expect_c is None:
            expect_c, expect_a, expect_t = self._ddpg_loss_helper(
                input_,
                weights_dict,
                sorted(weights_dict.keys()),
                fw,
                gamma=config["gamma"],
                huber_threshold=config["huber_threshold"],
                l2_reg=config["l2_reg"],
                sess=sess,
            )

        # Get actual outs and compare to expectation AND previous
        # framework. c=critic, a=actor, t=td-error.
        if fw == "tf":
            c, a, t, tf_c_grads, tf_a_grads = p_sess.run(
                [
                    policy.critic_loss,
                    policy.actor_loss,
                    policy.td_error,
                    policy._critic_optimizer.compute_gradients(
                        policy.critic_loss, policy.model.q_variables()),
                    policy._actor_optimizer.compute_gradients(
                        policy.actor_loss, policy.model.policy_variables()),
                ],
                feed_dict=policy._get_loss_inputs_dict(input_, shuffle=False),
            )
            # Check pure loss values.
            check(c, expect_c)
            check(a, expect_a)
            check(t, expect_t)

            tf_c_grads = [g for g, v in tf_c_grads]
            tf_a_grads = [g for g, v in tf_a_grads]
        elif fw == "torch":
            loss_torch(policy, policy.model, None, input_)
            c, a, t = (
                policy.get_tower_stats("critic_loss")[0],
                policy.get_tower_stats("actor_loss")[0],
                policy.get_tower_stats("td_error")[0],
            )
            # Check pure loss values.
            check(c, expect_c)
            check(a, expect_a)
            check(t, expect_t)

            # Test actor gradients.
            policy._actor_optimizer.zero_grad()
            assert all(v.grad is None for v in policy.model.q_variables())
            assert all(
                v.grad is None for v in policy.model.policy_variables())
            a.backward()
            # `actor_loss` depends on Q-net vars
            # (but not twin-Q-net vars!).
            assert not any(
                v.grad is None for v in policy.model.q_variables()[:4])
            assert all(
                v.grad is None for v in policy.model.q_variables()[4:])
            assert not all(
                torch.mean(v.grad) == 0
                for v in policy.model.policy_variables())
            assert not all(
                torch.min(v.grad) == 0
                for v in policy.model.policy_variables())
            # Compare with tf ones.
            torch_a_grads = [
                v.grad for v in policy.model.policy_variables()
            ]
            for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g.cpu()))
                else:
                    check(tf_g, torch_g)

            # Test critic gradients.
            policy._critic_optimizer.zero_grad()
            assert all(
                v.grad is None or torch.mean(v.grad) == 0.0
                for v in policy.model.q_variables())
            assert all(
                v.grad is None or torch.min(v.grad) == 0.0
                for v in policy.model.q_variables())
            c.backward()
            assert not all(
                torch.mean(v.grad) == 0
                for v in policy.model.q_variables())
            assert not all(
                torch.min(v.grad) == 0
                for v in policy.model.q_variables())
            # Compare with tf ones.
            torch_c_grads = [v.grad for v in policy.model.q_variables()]
            for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g.cpu()))
                else:
                    check(tf_g, torch_g)
            # Compare (unchanged(!) actor grads) with tf ones.
            torch_a_grads = [
                v.grad for v in policy.model.policy_variables()
            ]
            for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g.cpu()))
                else:
                    check(tf_g, torch_g)

        # Store this framework's losses in prev_fw_loss to compare with
        # next framework's outputs.
        if prev_fw_loss is not None:
            check(c, prev_fw_loss[0])
            check(a, prev_fw_loss[1])
            check(t, prev_fw_loss[2])
        prev_fw_loss = (c, a, t)

        # Update weights from our batch (n times).
        for update_iteration in range(6):
            print("train iteration {}".format(update_iteration))
            if fw == "tf":
                in_ = self._get_batch_helper(obs_size, actions, batch_size)
                tf_inputs.append(in_)
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                buf = MultiAgentReplayBuffer.get_instance_for_testing()
                buf._fake_batch = in_
                trainer.train()
                updated_weights = policy.get_weights()
                # Net must have changed.
                if tf_updated_weights:
                    check(
                        updated_weights[
                            "default_policy/actor_hidden_0/kernel"],
                        tf_updated_weights[-1][
                            "default_policy/actor_hidden_0/kernel"],
                        false=True,
                    )
                tf_updated_weights.append(updated_weights)
            # Compare with updated tf-weights. Must all be the same.
            else:
                tf_weights = tf_updated_weights[update_iteration]
                in_ = tf_inputs[update_iteration]
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                buf = MultiAgentReplayBuffer.get_instance_for_testing()
                buf._fake_batch = in_
                trainer.train()
                # Compare updated model and target weights.
                for tf_key in tf_weights.keys():
                    tf_var = tf_weights[tf_key]
                    # Target model ("..._1"/"sequential_2|3" vars).
                    if re.search(
                            "actor_out_1|actor_hidden_0_1|sequential_[23]",
                            tf_key):
                        torch_var = policy.target_model.state_dict()[
                            map_[tf_key]]
                    # Main model.
                    else:
                        torch_var = policy.model.state_dict()[map_[tf_key]]
                    if tf_var.shape != torch_var.shape:
                        check(
                            tf_var, np.transpose(torch_var.cpu()), atol=0.1)
                    else:
                        check(tf_var, torch_var, atol=0.1)

        trainer.stop()
def test_cql_compilation(self):
    """Test whether CQL can be built with all frameworks."""

    # Learns from a historic-data file.
    # To generate this data, first run:
    # $ ./train.py --run=SAC --env=Pendulum-v1 \
    #   --stop='{"timesteps_total": 50000}' \
    #   --config='{"output": "/tmp/out"}'
    rllib_dir = Path(__file__).parent.parent.parent.parent
    print("rllib dir={}".format(rllib_dir))
    data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json")
    print("data_file={} exists={}".format(data_file, os.path.isfile(data_file)))

    config = (
        cql.CQLConfig()
        .environment(env="Pendulum-v1")
        .offline_data(
            input_=[data_file],
            # In the files we use here for testing, actions have already
            # been normalized.
            # This is usually the case when the file was generated by
            # another RLlib algorithm (e.g. PPO or SAC).
            actions_in_input_normalized=False,
            # Switch on off-policy evaluation.
            off_policy_estimation_methods={
                "is": {"type": ImportanceSampling}
            },
        )
        .training(
            clip_actions=False,
            train_batch_size=2000,
            twin_q=True,
            replay_buffer_config={"learning_starts": 0},
            bc_iters=2,
        )
        .evaluation(
            always_attach_evaluation_results=True,
            evaluation_interval=2,
            evaluation_duration=10,
            evaluation_config={"input": "sampler"},
            evaluation_parallel_to_training=False,
            evaluation_num_workers=2,
        )
        .rollouts(rollout_fragment_length=1)
    )
    num_iterations = 4

    # Test for tf/torch frameworks.
    for fw in framework_iterator(config, with_eager_tracing=True):
        trainer = config.build()
        for i in range(num_iterations):
            results = trainer.train()
            check_train_results(results)
            print(results)
            eval_results = results["evaluation"]
            print(
                f"iter={trainer.iteration} "
                f"R={eval_results['episode_reward_mean']}"
            )

        check_compute_single_action(trainer)

        # Get policy and model.
        pol = trainer.get_policy()
        cql_model = pol.model
        if fw == "tf":
            pol.get_session().__enter__()

        # Example on how to do evaluation on the trained Trainer
        # using the data from CQL's global replay buffer.
        # Get a sample (MultiAgentBatch).
        multi_agent_batch = trainer.local_replay_buffer.sample(
            num_items=config.train_batch_size)
        # All experiences have been buffered for `default_policy`.
        batch = multi_agent_batch.policy_batches["default_policy"]

        if fw == "torch":
            obs = torch.from_numpy(batch["obs"])
        else:
            obs = batch["obs"]
            batch["actions"] = batch["actions"].astype(np.float32)

        # Pass the observations through our model to get the features,
        # which we then pass through the Q-head.
        model_out, _ = cql_model({"obs": obs})

        # The estimated Q-values from the (historic) actions in the batch.
        if fw == "torch":
            q_values_old = cql_model.get_q_values(
                model_out, torch.from_numpy(batch["actions"]))
        else:
            q_values_old = cql_model.get_q_values(
                tf.convert_to_tensor(model_out), batch["actions"])

        # The estimated Q-values for the new actions computed
        # by our trainer policy.
        actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0]
        if fw == "torch":
            q_values_new = cql_model.get_q_values(
                model_out, torch.from_numpy(actions_new))
        else:
            q_values_new = cql_model.get_q_values(model_out, actions_new)

        if fw == "tf":
            q_values_old, q_values_new = pol.get_session().run(
                [q_values_old, q_values_new])

        print(f"Q-val batch={q_values_old}")
        print(f"Q-val policy={q_values_new}")

        if fw == "tf":
            pol.get_session().__exit__(None, None, None)

        trainer.stop()
def test_td3_exploration_and_with_random_prerun(self):
    """Tests TD3's Exploration (w/ random actions for n timesteps)."""
    config = td3.TD3_DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    obs = np.array([0.0, 0.1, -0.1])

    # Test against all frameworks.
    for _ in framework_iterator(config, with_eager_tracing=True):
        lcl_config = config.copy()
        # Default GaussianNoise setup.
        trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v1")
        # Setting explore=False should always return the same action.
        a_ = trainer.compute_single_action(obs, explore=False)
        self.assertEqual(trainer.get_policy().global_timestep, 1)
        for i in range(50):
            a = trainer.compute_single_action(obs, explore=False)
            self.assertEqual(trainer.get_policy().global_timestep, i + 2)
            check(a, a_)
        # explore=None (default: explore) should return different actions.
        actions = []
        for i in range(50):
            actions.append(trainer.compute_single_action(obs))
            self.assertEqual(trainer.get_policy().global_timestep, i + 52)
        check(np.std(actions), 0.0, false=True)
        trainer.stop()

        # Check randomness at beginning.
        lcl_config["exploration_config"] = {
            # Act randomly at beginning ...
            "random_timesteps": 30,
            # Then act very close to the deterministic actions thereafter.
            "stddev": 0.001,
            "initial_scale": 0.001,
            "final_scale": 0.001,
        }
        trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v1")
        # ts=0 (get a deterministic action as per explore=False).
        deterministic_action = trainer.compute_single_action(
            obs, explore=False)
        self.assertEqual(trainer.get_policy().global_timestep, 1)
        # ts=1-29 (in random window).
        random_a = []
        for i in range(1, 30):
            random_a.append(trainer.compute_single_action(obs, explore=True))
            self.assertEqual(trainer.get_policy().global_timestep, i + 1)
            check(random_a[-1], deterministic_action, false=True)
        self.assertTrue(np.std(random_a) > 0.3)

        # ts > 30 (a=deterministic_action + scale * N[0,1]).
        for i in range(50):
            a = trainer.compute_single_action(obs, explore=True)
            self.assertEqual(trainer.get_policy().global_timestep, i + 31)
            check(a, deterministic_action, rtol=0.1)

        # ts >> 30 (BUT: explore=False -> expect deterministic action).
        for i in range(50):
            a = trainer.compute_single_action(obs, explore=False)
            self.assertEqual(trainer.get_policy().global_timestep, i + 81)
            check(a, deterministic_action)
        trainer.stop()
def test_add_delete_policy(self): config = pg.DEFAULT_CONFIG.copy() config.update( { "env": MultiAgentCartPole, "env_config": { "config": { "num_agents": 4, }, }, "num_workers": 2, # Test on remote workers as well. "model": { "fcnet_hiddens": [5], "fcnet_activation": "linear", }, "train_batch_size": 100, "rollout_fragment_length": 50, "multiagent": { # Start with a single policy. "policies": {"p0"}, "policy_mapping_fn": lambda aid, eps, worker, **kwargs: "p0", # And only two policies that can be stored in memory at a # time. "policy_map_capacity": 2, }, } ) for _ in framework_iterator(config): trainer = pg.PGTrainer(config=config) pol0 = trainer.get_policy("p0") r = trainer.train() self.assertTrue("p0" in r["info"][LEARNER_INFO]) for i in range(1, 3): def new_mapping_fn(agent_id, episode, worker, **kwargs): return f"p{choice([i, i - 1])}" # Add a new policy. pid = f"p{i}" new_pol = trainer.add_policy( pid, trainer.get_default_policy_class(config), # Test changing the mapping fn. policy_mapping_fn=new_mapping_fn, # Change the list of policies to train. policies_to_train=[f"p{i}", f"p{i-1}"], ) pol_map = trainer.workers.local_worker().policy_map self.assertTrue(new_pol is not pol0) for j in range(i + 1): self.assertTrue(f"p{j}" in pol_map) self.assertTrue(len(pol_map) == i + 1) trainer.train() checkpoint = trainer.save() # Test restoring from the checkpoint (which has more policies # than what's defined in the config dict). test = pg.PGTrainer(config=config) test.restore(checkpoint) pol0 = test.get_policy("p0") test.train() # Test creating an action with the added (and restored) policy. a = test.compute_single_action( np.zeros_like(pol0.observation_space.sample()), policy_id=pid ) self.assertTrue(pol0.action_space.contains(a)) test.stop() # Delete all added policies again from trainer. for i in range(2, 0, -1): trainer.remove_policy( f"p{i}", # Note that the complete signature of a policy_mapping_fn # is: `agent_id, episode, worker, **kwargs`. policy_mapping_fn=lambda aid, eps, **kwargs: f"p{i - 1}", policies_to_train=[f"p{i - 1}"], ) trainer.stop()
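# Illustration only (standalone snippet, not RLlib's PolicyMap): a toy LRU map
# showing what `policy_map_capacity` means conceptually. At most `capacity`
# policies stay "in memory"; least recently used ones get stashed away (here
# into a plain dict, standing in for the swap-out mechanism).
from collections import OrderedDict

class ToyPolicyMap:
    def __init__(self, capacity=2):
        self.capacity = capacity
        self.cache = OrderedDict()
        self.stashed = {}

    def put(self, policy_id, policy):
        self.cache[policy_id] = policy
        self.cache.move_to_end(policy_id)
        while len(self.cache) > self.capacity:
            old_id, old_pol = self.cache.popitem(last=False)
            self.stashed[old_id] = old_pol

    def get(self, policy_id):
        if policy_id not in self.cache:
            # Bring a stashed policy back in (possibly evicting another one).
            self.put(policy_id, self.stashed.pop(policy_id))
        self.cache.move_to_end(policy_id)
        return self.cache[policy_id]

pm = ToyPolicyMap(capacity=2)
for pid in ("p0", "p1", "p2"):
    pm.put(pid, object())
assert list(pm.cache) == ["p1", "p2"] and "p0" in pm.stashed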
def learn_test_plus_evaluate(algo, env="CartPole-v0"): for fw in framework_iterator(frameworks=("tf", "torch")): fw_ = ', \\"framework\\": \\"{}\\"'.format(fw) tmp_dir = os.popen("mktemp -d").read()[:-1] if not os.path.exists(tmp_dir): # Last resort: Resolve via underlying tempdir (and cut tmp_. tmp_dir = ray._private.utils.tempfile.gettempdir() + tmp_dir[4:] if not os.path.exists(tmp_dir): sys.exit(1) print("Saving results to {}".format(tmp_dir)) rllib_dir = str(Path(__file__).parent.parent.absolute()) print("RLlib dir = {}\nexists={}".format(rllib_dir, os.path.exists(rllib_dir))) os.system("python {}/train.py --local-dir={} --run={} " "--checkpoint-freq=1 --checkpoint-at-end ".format( rllib_dir, tmp_dir, algo) + '--config="{\\"num_gpus\\": 0, \\"num_workers\\": 1, ' '\\"evaluation_config\\": {\\"explore\\": false}' + fw_ + '}" ' + '--stop="{\\"episode_reward_mean\\": 100.0}"' + " --env={}".format(env)) # Find last checkpoint and use that for the rollout. checkpoint_path = os.popen( "ls {}/default/*/checkpoint_*/checkpoint-*".format( tmp_dir)).read()[:-1] checkpoints = [ cp for cp in checkpoint_path.split("\n") if re.match(r"^.+checkpoint-\d+$", cp) ] # Sort by number and pick last (which should be the best checkpoint). last_checkpoint = sorted( checkpoints, key=lambda x: int(re.match(r".+checkpoint-(\d+)", x).group(1)))[-1] assert re.match(r"^.+checkpoint_\d+/checkpoint-\d+$", last_checkpoint) if not os.path.exists(last_checkpoint): sys.exit(1) print("Best checkpoint={} (exists)".format(last_checkpoint)) # Test rolling out n steps. result = os.popen("python {}/evaluate.py --run={} " "--steps=400 " '--out="{}/rollouts_n_steps.pkl" "{}"'.format( rllib_dir, algo, tmp_dir, last_checkpoint)).read()[:-1] if not os.path.exists(tmp_dir + "/rollouts_n_steps.pkl"): sys.exit(1) print("Rollout output exists -> Checking reward ...") episodes = result.split("\n") mean_reward = 0.0 num_episodes = 0 for ep in episodes: mo = re.match(r"Episode .+reward: ([\d\.\-]+)", ep) if mo: mean_reward += float(mo.group(1)) num_episodes += 1 mean_reward /= num_episodes print("Rollout's mean episode reward={}".format(mean_reward)) assert mean_reward >= 100.0 # Cleanup. os.popen('rm -rf "{}"'.format(tmp_dir)).read()
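# Illustration only (standalone snippet): the regex-and-sort idea used above to
# pick the latest checkpoint, shown on hypothetical path strings. The numeric
# sort matters; a plain lexicographic sort would rank "checkpoint-9" after
# "checkpoint-10".
import re

def latest_checkpoint(paths):
    ckpts = [p for p in paths if re.match(r"^.+checkpoint-\d+$", p)]
    return sorted(
        ckpts, key=lambda p: int(re.match(r".+checkpoint-(\d+)", p).group(1)))[-1]

paths = [
    "/tmp/default/run/checkpoint_000002/checkpoint-2",
    "/tmp/default/run/checkpoint_000010/checkpoint-10",
    "/tmp/default/run/checkpoint_000009/checkpoint-9",
]
assert latest_checkpoint(paths).endswith("checkpoint-10")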
def test_ppo_loss_function(self): """Tests the PPO loss function math.""" config = copy.deepcopy(ppo.DEFAULT_CONFIG) config["num_workers"] = 0 # Run locally. config["gamma"] = 0.99 config["model"]["fcnet_hiddens"] = [10] config["model"]["fcnet_activation"] = "linear" config["model"]["vf_share_layers"] = True for fw, sess in framework_iterator(config, session=True): trainer = ppo.PPOTrainer(config=config, env="CartPole-v0") policy = trainer.get_policy() # Check no free log std var by default. if fw == "torch": matching = [ v for (n, v) in policy.model.named_parameters() if "log_std" in n ] else: matching = [ v for v in policy.model.trainable_variables() if "log_std" in str(v) ] assert len(matching) == 0, matching # Post-process (calculate simple (non-GAE) advantages) and attach # to train_batch dict. # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] = # [0.50005, -0.505, 0.5] train_batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy()) if fw == "torch": train_batch = policy._lazy_tensor_dict(train_batch) # Check Advantage values. check(train_batch[Postprocessing.VALUE_TARGETS], [0.50005, -0.505, 0.5]) # Calculate actual PPO loss. if fw in ["tf2", "tfe"]: ppo_surrogate_loss_tf(policy, policy.model, Categorical, train_batch) elif fw == "torch": ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical, train_batch) vars = policy.model.variables() if fw != "torch" else \ list(policy.model.parameters()) if fw == "tf": vars = policy.get_session().run(vars) expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS], vars[0 if fw != "torch" else 2], vars[1 if fw != "torch" else 3], framework=fw) expected_logits = fc(expected_shared_out, vars[2 if fw != "torch" else 0], vars[3 if fw != "torch" else 1], framework=fw) expected_value_outs = fc(expected_shared_out, vars[4], vars[5], framework=fw) kl, entropy, pg_loss, vf_loss, overall_loss = \ self._ppo_loss_helper( policy, policy.model, Categorical if fw != "torch" else TorchCategorical, train_batch, expected_logits, expected_value_outs, sess=sess ) if sess: policy_sess = policy.get_session() k, e, pl, v, tl = policy_sess.run( [ policy._mean_kl_loss, policy._mean_entropy, policy._mean_policy_loss, policy._mean_vf_loss, policy._total_loss, ], feed_dict=policy._get_loss_inputs_dict(train_batch, shuffle=False)) check(k, kl) check(e, entropy) check(pl, np.mean(-pg_loss)) check(v, np.mean(vf_loss), decimals=4) check(tl, overall_loss, decimals=4) elif fw == "torch": check(policy.model.tower_stats["mean_kl_loss"], kl) check(policy.model.tower_stats["mean_entropy"], entropy) check(policy.model.tower_stats["mean_policy_loss"], np.mean(-pg_loss)) check(policy.model.tower_stats["mean_vf_loss"], np.mean(vf_loss), decimals=4) check(policy.model.tower_stats["total_loss"], overall_loss, decimals=4) else: check(policy._mean_kl_loss, kl) check(policy._mean_entropy, entropy) check(policy._mean_policy_loss, np.mean(-pg_loss)) check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4) check(policy._total_loss, overall_loss, decimals=4) trainer.stop()
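# Illustration only (standalone snippet): recomputes the simple (non-GAE) value
# targets that the advantage comment above spells out, assuming (as that comment
# implies) rewards [1.0, -1.0, 0.5], gamma=0.99, and a terminal last step with
# no value bootstrapping.
import numpy as np

def discounted_returns(rewards, gamma):
    returns = np.zeros(len(rewards))
    running = 0.0
    # Accumulate the discounted sum backwards through the episode.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

print(discounted_returns(np.array([1.0, -1.0, 0.5]), gamma=0.99))
# -> [ 0.50005 -0.505    0.5    ]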
def do_test_parameter_noise_exploration(self, trainer_cls, config, env, env_config, obs, fws): """Tests, whether an Agent works with ParameterNoise.""" core_config = config.copy() core_config["num_workers"] = 0 # Run locally. core_config["env_config"] = env_config for fw in framework_iterator(core_config, fws): config = core_config.copy() # Algo with ParameterNoise exploration (config["explore"]=True). # ---- config["exploration_config"] = {"type": "ParameterNoise"} config["explore"] = True trainer = trainer_cls(config=config, env=env) policy = trainer.get_policy() self.assertFalse(policy.exploration.weights_are_currently_noisy) noise_before = self._get_current_noise(policy, fw) check(noise_before, 0.0) initial_weights = self._get_current_weight(policy, fw) # Pseudo-start an episode and compare the weights before and after. policy.exploration.on_episode_start(policy, tf_sess=policy._sess) self.assertFalse(policy.exploration.weights_are_currently_noisy) noise_after_ep_start = self._get_current_noise(policy, fw) weights_after_ep_start = self._get_current_weight(policy, fw) # Should be the same, as we don't do anything at the beginning of # the episode, only one step later. check(noise_after_ep_start, noise_before) check(initial_weights, weights_after_ep_start) # Setting explore=False should always return the same action. a_ = trainer.compute_action(obs, explore=False) self.assertFalse(policy.exploration.weights_are_currently_noisy) noise = self._get_current_noise(policy, fw) # We sampled the first noise (not zero anymore). check(noise, 0.0, false=True) # But still not applied b/c explore=False. check(self._get_current_weight(policy, fw), initial_weights) for _ in range(10): a = trainer.compute_action(obs, explore=False) check(a, a_) # Noise never gets applied. check(self._get_current_weight(policy, fw), initial_weights) self.assertFalse( policy.exploration.weights_are_currently_noisy) # Explore=None (default: True) should return different actions. # However, this is only due to the underlying epsilon-greedy # exploration. actions = [] current_weight = None for _ in range(10): actions.append(trainer.compute_action(obs)) self.assertTrue(policy.exploration.weights_are_currently_noisy) # Now, noise actually got applied (explore=True). current_weight = self._get_current_weight(policy, fw) check(current_weight, initial_weights, false=True) check(current_weight, initial_weights + noise) check(np.std(actions), 0.0, false=True) # Pseudo-end the episode and compare weights again. # Make sure they are the original ones. policy.exploration.on_episode_end(policy, tf_sess=policy._sess) weights_after_ep_end = self._get_current_weight(policy, fw) check(current_weight - noise, weights_after_ep_end, decimals=5) # DQN with ParameterNoise exploration (config["explore"]=False). # ---- config = core_config.copy() config["exploration_config"] = {"type": "ParameterNoise"} config["explore"] = False trainer = trainer_cls(config=config, env=env) policy = trainer.get_policy() self.assertFalse(policy.exploration.weights_are_currently_noisy) initial_weights = self._get_current_weight(policy, fw) # Noise before anything (should be 0.0, no episode started yet). noise = self._get_current_noise(policy, fw) check(noise, 0.0) # Pseudo-start an episode and compare the weights before and after # (they should be the same). 
policy.exploration.on_episode_start(policy, tf_sess=policy._sess) self.assertFalse(policy.exploration.weights_are_currently_noisy) # Should be the same, as we don't do anything at the beginning of # the episode, only one step later. noise = self._get_current_noise(policy, fw) check(noise, 0.0) noisy_weights = self._get_current_weight(policy, fw) check(initial_weights, noisy_weights) # Setting explore=False or None should always return the same # action. a_ = trainer.compute_action(obs, explore=False) # Now we have re-sampled. noise = self._get_current_noise(policy, fw) check(noise, 0.0, false=True) for _ in range(5): a = trainer.compute_action(obs, explore=None) check(a, a_) a = trainer.compute_action(obs, explore=False) check(a, a_) # Pseudo-end the episode and compare weights again. # Make sure they are the original ones (no noise permanently # applied throughout the episode). policy.exploration.on_episode_end(policy, tf_sess=policy._sess) weights_after_episode_end = self._get_current_weight(policy, fw) check(initial_weights, weights_after_episode_end) # Noise should still be the same (re-sampling only happens at # beginning of episode). noise_after = self._get_current_noise(policy, fw) check(noise, noise_after) # Switch off underlying exploration entirely. # ---- config = core_config.copy() if trainer_cls is dqn.DQNTrainer: sub_config = { "type": "EpsilonGreedy", "initial_epsilon": 0.0, # <- no randomness whatsoever "final_epsilon": 0.0, } else: sub_config = { "type": "OrnsteinUhlenbeckNoise", "initial_scale": 0.0, # <- no randomness whatsoever "final_scale": 0.0, "random_timesteps": 0, } config["exploration_config"] = { "type": "ParameterNoise", "sub_exploration": sub_config, } config["explore"] = True trainer = trainer_cls(config=config, env=env) # Now, when we act - even with explore=True - we would expect # the same action for the same input (parameter noise is # deterministic). policy = trainer.get_policy() policy.exploration.on_episode_start(policy, tf_sess=policy._sess) a_ = trainer.compute_action(obs) for _ in range(10): a = trainer.compute_action(obs, explore=True) check(a, a_)
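# Illustration only (standalone snippet, not RLlib's ParameterNoise class): the
# basic parameter-space-noise idea behind the checks above: perturb the weights
# once per episode and remove the perturbation at episode end, so that actions
# within an episode stay deterministic. Note that RLlib defers applying the
# noise until the first explore=True action rather than doing it at
# on_episode_start, which is exactly what the assertions above verify; this toy
# class applies it immediately for simplicity.
import numpy as np

class ToyParamNoisePolicy:
    def __init__(self, stddev=0.05, seed=0):
        self.rng = np.random.default_rng(seed)
        self.w = np.array([[0.5, -0.3], [0.1, 0.2]])
        self.noise = np.zeros_like(self.w)
        self.stddev = stddev

    def on_episode_start(self):
        # Sample and apply one fixed perturbation for the whole episode.
        self.noise = self.rng.normal(0.0, self.stddev, size=self.w.shape)
        self.w = self.w + self.noise

    def on_episode_end(self):
        # Restore the original (noise-free) weights.
        self.w = self.w - self.noise

    def compute_action(self, obs):
        return int(np.argmax(obs @ self.w))

pol = ToyParamNoisePolicy()
w_before = pol.w.copy()
pol.on_episode_start()
a0 = pol.compute_action(np.array([1.0, 0.0]))
# Same (noisy) weights within the episode -> same action every time.
assert all(pol.compute_action(np.array([1.0, 0.0])) == a0 for _ in range(10))
pol.on_episode_end()
np.testing.assert_allclose(pol.w, w_before)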
def test_simple_q_loss_function(self):
    """Tests the Simple-Q loss function results on all frameworks."""
    config = simple_q.SimpleQConfig().rollouts(num_rollout_workers=0)
    # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
    config.training(
        model={
            "fcnet_hiddens": [10],
            "fcnet_activation": "linear",
        }
    )

    for fw in framework_iterator(config):
        # Generate Trainer and get its default Policy object.
        trainer = simple_q.SimpleQ(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        # Batch of size=2.
        input_ = SampleBatch(
            {
                SampleBatch.CUR_OBS: np.random.random(size=(2, 4)),
                SampleBatch.ACTIONS: np.array([0, 1]),
                SampleBatch.REWARDS: np.array([0.4, -1.23]),
                SampleBatch.DONES: np.array([False, False]),
                SampleBatch.NEXT_OBS: np.random.random(size=(2, 4)),
                SampleBatch.EPS_ID: np.array([1234, 1234]),
                SampleBatch.AGENT_INDEX: np.array([0, 0]),
                SampleBatch.ACTION_LOGP: np.array([-0.1, -0.1]),
                SampleBatch.ACTION_DIST_INPUTS: np.array(
                    [[0.1, 0.2], [-0.1, -0.2]]),
                SampleBatch.ACTION_PROB: np.array([0.1, 0.2]),
                "q_values": np.array([[0.1, 0.2], [0.2, 0.1]]),
            }
        )
        # Get model vars for computing expected model outs (q-vals).
        # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias
        vars = policy.get_weights()
        if isinstance(vars, dict):
            vars = list(vars.values())
        vars_t = policy.target_model.variables()
        if fw == "tf":
            vars_t = policy.get_session().run(vars_t)

        # Q(s,a) outputs.
        q_t = np.sum(
            one_hot(input_[SampleBatch.ACTIONS], 2)
            * fc(
                fc(
                    input_[SampleBatch.CUR_OBS],
                    vars[0 if fw != "torch" else 2],
                    vars[1 if fw != "torch" else 3],
                    framework=fw,
                ),
                vars[2 if fw != "torch" else 0],
                vars[3 if fw != "torch" else 1],
                framework=fw,
            ),
            1,
        )
        # max[a'](Qtarget(s',a')) outputs.
        q_target_tp1 = np.max(
            fc(
                fc(
                    input_[SampleBatch.NEXT_OBS],
                    vars_t[0 if fw != "torch" else 2],
                    vars_t[1 if fw != "torch" else 3],
                    framework=fw,
                ),
                vars_t[2 if fw != "torch" else 0],
                vars_t[3 if fw != "torch" else 1],
                framework=fw,
            ),
            1,
        )
        # TD-errors (Bellman equation):
        # Q(s,a) - (r + gamma * max[a'](Qtarget(s',a'))).
        td_error = q_t - (
            input_[SampleBatch.REWARDS] + config.gamma * q_target_tp1
        )
        # Huber/Square loss on TD-error.
        expected_loss = huber_loss(td_error).mean()

        if fw == "torch":
            input_ = policy._lazy_tensor_dict(input_)
        # Get actual out and compare.
        if fw == "tf":
            out = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(input_, shuffle=False),
            )
        else:
            out = (loss_torch if fw == "torch" else loss_tf)(
                policy, policy.model, None, input_)
        check(out, expected_loss, decimals=1)
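# Illustration only (standalone snippet): the "expected loss" math above, redone
# with made-up Q-values instead of real model outputs: Q(s,a) is compared against
# the Bellman target r + gamma * max_a' Q_target(s',a') and pushed through a
# Huber loss.
import numpy as np

def huber(x, delta=1.0):
    return np.where(np.abs(x) < delta, 0.5 * x ** 2, delta * (np.abs(x) - 0.5 * delta))

q_t_all = np.array([[0.1, 0.2], [0.2, 0.1]])       # Toy Q(s, .) for 2 samples.
q_tp1_all = np.array([[0.05, 0.15], [0.0, -0.1]])  # Toy Q_target(s', .).
actions = np.array([0, 1])
rewards = np.array([0.4, -1.23])
dones = np.array([False, False])
gamma = 0.99

q_t_selected = q_t_all[np.arange(2), actions]
q_tp1_best = np.max(q_tp1_all, axis=1) * (1.0 - dones.astype(np.float64))
td_error = q_t_selected - (rewards + gamma * q_tp1_best)
print(huber(td_error).mean())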
def test_traj_view_simple_performance(self): """Test whether PPOTrainer runs faster w/ `_use_trajectory_view_api`. """ config = copy.deepcopy(ppo.DEFAULT_CONFIG) action_space = Discrete(2) obs_space = Box(-1.0, 1.0, shape=(700, )) from ray.rllib.examples.env.random_env import RandomMultiAgentEnv from ray.tune import register_env register_env( "ma_env", lambda c: RandomMultiAgentEnv({ "num_agents": 2, "p_done": 0.0, "max_episode_len": 104, "action_space": action_space, "observation_space": obs_space })) config["num_workers"] = 3 config["num_envs_per_worker"] = 8 config["num_sgd_iter"] = 1 # Put less weight on training. policies = { "pol0": (None, obs_space, action_space, {}), } def policy_fn(agent_id): return "pol0" config["multiagent"] = { "policies": policies, "policy_mapping_fn": policy_fn, } num_iterations = 2 # Only works in torch so far. for _ in framework_iterator(config, frameworks="torch"): print("w/ traj. view API") config["_use_trajectory_view_api"] = True trainer = ppo.PPOTrainer(config=config, env="ma_env") learn_time_w = 0.0 sampler_perf_w = {} start = time.time() for i in range(num_iterations): out = trainer.train() ts = out["timesteps_total"] sampler_perf_ = out["sampler_perf"] sampler_perf_w = { k: sampler_perf_w.get(k, 0.0) + (sampler_perf_[k] * 1000 / ts) for k, v in sampler_perf_.items() } delta = out["timers"]["learn_time_ms"] / ts learn_time_w += delta print("{}={}s".format(i, delta)) sampler_perf_w = { k: sampler_perf_w[k] / (num_iterations if "mean_" in k else 1) for k, v in sampler_perf_w.items() } duration_w = time.time() - start print("Duration: {}s " "sampler-perf.={} learn-time/iter={}s".format( duration_w, sampler_perf_w, learn_time_w / num_iterations)) trainer.stop() print("w/o traj. view API") config["_use_trajectory_view_api"] = False trainer = ppo.PPOTrainer(config=config, env="ma_env") learn_time_wo = 0.0 sampler_perf_wo = {} start = time.time() for i in range(num_iterations): out = trainer.train() ts = out["timesteps_total"] sampler_perf_ = out["sampler_perf"] sampler_perf_wo = { k: sampler_perf_wo.get(k, 0.0) + (sampler_perf_[k] * 1000 / ts) for k, v in sampler_perf_.items() } delta = out["timers"]["learn_time_ms"] / ts learn_time_wo += delta print("{}={}s".format(i, delta)) sampler_perf_wo = { k: sampler_perf_wo[k] / (num_iterations if "mean_" in k else 1) for k, v in sampler_perf_wo.items() } duration_wo = time.time() - start print("Duration: {}s " "sampler-perf.={} learn-time/iter={}s".format( duration_wo, sampler_perf_wo, learn_time_wo / num_iterations)) trainer.stop() # Assert `_use_trajectory_view_api` is faster. self.assertLess(sampler_perf_w["mean_raw_obs_processing_ms"], sampler_perf_wo["mean_raw_obs_processing_ms"]) self.assertLess(sampler_perf_w["mean_action_processing_ms"], sampler_perf_wo["mean_action_processing_ms"]) self.assertLess(duration_w, duration_wo)
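# Illustration only (standalone snippet): the per-1000-env-steps normalization
# applied to the sampler timers above before the two configurations are
# compared. The input dict is a made-up example, not real measurements.
def normalize_sampler_perf(sampler_perf, timesteps_total):
    return {k: v * 1000 / timesteps_total for k, v in sampler_perf.items()}

print(normalize_sampler_perf(
    {"mean_raw_obs_processing_ms": 12.0, "mean_action_processing_ms": 4.0},
    timesteps_total=4000,
))
# -> {'mean_raw_obs_processing_ms': 3.0, 'mean_action_processing_ms': 1.0}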
def test_simple_q_loss_function(self):
    """Tests the Simple-Q loss function results on all frameworks."""
    config = dqn.SIMPLE_Q_DEFAULT_CONFIG.copy()
    # Run locally.
    config["num_workers"] = 0
    # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    for fw in framework_iterator(config):
        # Generate Trainer and get its default Policy object.
        trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        # Batch of size=2.
        input_ = {
            SampleBatch.CUR_OBS: np.random.random(size=(2, 4)),
            SampleBatch.ACTIONS: np.array([0, 1]),
            SampleBatch.REWARDS: np.array([0.4, -1.23]),
            SampleBatch.DONES: np.array([False, False]),
            SampleBatch.NEXT_OBS: np.random.random(size=(2, 4))
        }
        # Get model vars for computing expected model outs (q-vals).
        # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias
        vars = policy.get_weights()
        if isinstance(vars, dict):
            vars = list(vars.values())
        vars_t = policy.target_q_func_vars
        if fw == "tf":
            vars_t = policy.get_session().run(vars_t)

        # Q(s,a) outputs.
        q_t = np.sum(
            one_hot(input_[SampleBatch.ACTIONS], 2) * fc(
                fc(input_[SampleBatch.CUR_OBS],
                   vars[0 if fw != "torch" else 2],
                   vars[1 if fw != "torch" else 3],
                   framework=fw),
                vars[2 if fw != "torch" else 0],
                vars[3 if fw != "torch" else 1],
                framework=fw), 1)
        # max[a'](Qtarget(s',a')) outputs.
        q_target_tp1 = np.max(
            fc(
                fc(input_[SampleBatch.NEXT_OBS],
                   vars_t[0 if fw != "torch" else 2],
                   vars_t[1 if fw != "torch" else 3],
                   framework=fw),
                vars_t[2 if fw != "torch" else 0],
                vars_t[3 if fw != "torch" else 1],
                framework=fw), 1)
        # TD-errors (Bellman equation):
        # Q(s,a) - (r + gamma * max[a'](Qtarget(s',a'))).
        td_error = q_t - (
            input_[SampleBatch.REWARDS] + config["gamma"] * q_target_tp1)
        # Huber/Square loss on TD-error.
        expected_loss = huber_loss(td_error).mean()

        if fw == "torch":
            input_ = policy._lazy_tensor_dict(input_)
        # Get actual out and compare.
        if fw == "tf":
            out = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(input_, shuffle=False))
        else:
            out = (loss_torch if fw == "torch" else loss_tf)(
                policy, policy.model, None, input_)
        check(out, expected_loss, decimals=1)
def test_agent_output_ok(self): for fw in framework_iterator(frameworks=("torch", "tf")): self.write_outputs(self.test_dir, fw) self.assertEqual(len(os.listdir(self.test_dir + fw)), 1) reader = JsonReader(self.test_dir + fw + "/*.json") reader.next()
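# Illustration only (standalone snippet, plain `json` instead of RLlib's
# JsonWriter/JsonReader): the write-then-read-back round trip this test relies
# on: one JSON record per line in a single output file.
import json
import os
import tempfile

out_dir = tempfile.mkdtemp()
path = os.path.join(out_dir, "output-000.json")
with open(path, "w") as f:
    f.write(json.dumps({"obs": [0.1, 0.2], "actions": 1, "rewards": 0.5}) + "\n")

# Exactly one output file was produced, and its first record parses back.
assert len(os.listdir(out_dir)) == 1
with open(path) as f:
    print(json.loads(f.readline()))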
def do_test_log_likelihood(run, config, prev_a=None, continuous=False, layer_key=("fc", (0, 4), ("_hidden_layers.0.", "_logits.")), logp_func=None): config = config.copy() # Run locally. config["num_workers"] = 0 # Env setup. if continuous: env = "Pendulum-v0" obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]]) else: env = "FrozenLake-v0" config["env_config"] = {"is_slippery": False, "map_name": "4x4"} obs_batch = np.array([0]) preprocessed_obs_batch = one_hot(obs_batch, depth=16) prev_r = None if prev_a is None else np.array(0.0) # Test against all frameworks. for fw in framework_iterator(config): trainer = run(config=config, env=env) policy = trainer.get_policy() vars = policy.get_weights() # Sample n actions, then roughly check their logp against their # counts. num_actions = 1000 if not continuous else 50 actions = [] for _ in range(num_actions): # Single action from single obs. actions.append( trainer.compute_action(obs_batch[0], prev_action=prev_a, prev_reward=prev_r, explore=True)) # Test all taken actions for their log-likelihoods vs expected values. if continuous: for idx in range(num_actions): a = actions[idx] if fw != "torch": if isinstance(vars, list): expected_mean_logstd = fc( fc(obs_batch, vars[layer_key[1][0]]), vars[layer_key[1][1]]) else: expected_mean_logstd = fc( fc( obs_batch, vars["default_policy/{}_1/kernel".format( layer_key[0])]), vars["default_policy/{}_out/kernel".format( layer_key[0])]) else: expected_mean_logstd = fc( fc(obs_batch, vars["{}_model.0.weight".format(layer_key[2][0])], framework=fw), vars["{}_model.0.weight".format(layer_key[2][1])], framework=fw) mean, log_std = np.split(expected_mean_logstd, 2, axis=-1) if logp_func is None: expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std))) else: expected_logp = logp_func(mean, log_std, a) logp = policy.compute_log_likelihoods( np.array([a]), preprocessed_obs_batch, prev_action_batch=np.array([prev_a]) if prev_a else None, prev_reward_batch=np.array([prev_r]) if prev_r else None) check(logp, expected_logp[0], rtol=0.2) # Test all available actions for their logp values. else: for a in [0, 1, 2, 3]: count = actions.count(a) expected_prob = count / num_actions logp = policy.compute_log_likelihoods( np.array([a]), preprocessed_obs_batch, prev_action_batch=np.array([prev_a]) if prev_a else None, prev_reward_batch=np.array([prev_r]) if prev_r else None) check(np.exp(logp), expected_prob, atol=0.2)
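# Illustration only (standalone snippet): the closed-form diagonal-Gaussian
# log-likelihood that the continuous branch above checks, i.e. the same value as
# np.log(norm.pdf(a, mean, np.exp(log_std))) summed over action dimensions.
# Numbers are toy values, not outputs of a trained policy.
import numpy as np
from scipy.stats import norm

def gaussian_logp(action, mean, log_std):
    std = np.exp(log_std)
    return np.sum(
        -0.5 * ((action - mean) / std) ** 2 - log_std - 0.5 * np.log(2.0 * np.pi))

a, mean, log_std = np.array([0.3]), np.array([0.1]), np.array([-0.5])
print(gaussian_logp(a, mean, log_std))
print(np.log(norm.pdf(a, mean, np.exp(log_std))).sum())  # Same value.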
def learn_test_multi_agent_plus_evaluate(algo): for fw in framework_iterator(frameworks=("tf", "torch")): tmp_dir = os.popen("mktemp -d").read()[:-1] if not os.path.exists(tmp_dir): # Last resort: Resolve via underlying tempdir (and cut tmp_. tmp_dir = ray._private.utils.tempfile.gettempdir() + tmp_dir[4:] if not os.path.exists(tmp_dir): sys.exit(1) print("Saving results to {}".format(tmp_dir)) rllib_dir = str(Path(__file__).parent.parent.absolute()) print("RLlib dir = {}\nexists={}".format(rllib_dir, os.path.exists(rllib_dir))) def policy_fn(agent_id, episode, **kwargs): return "pol{}".format(agent_id) config = { "num_gpus": 0, "num_workers": 1, "evaluation_config": { "explore": False }, "framework": fw, "env": MultiAgentCartPole, "multiagent": { "policies": {"pol0", "pol1"}, "policy_mapping_fn": policy_fn, }, } stop = {"episode_reward_mean": 100.0} tune.run(algo, config=config, stop=stop, checkpoint_freq=1, checkpoint_at_end=True, local_dir=tmp_dir, verbose=1) # Find last checkpoint and use that for the rollout. checkpoint_path = os.popen("ls {}/PPO/*/checkpoint_*/" "checkpoint-*".format(tmp_dir)).read()[:-1] checkpoint_paths = checkpoint_path.split("\n") assert len(checkpoint_paths) > 0 checkpoints = [ cp for cp in checkpoint_paths if re.match(r"^.+checkpoint-\d+$", cp) ] # Sort by number and pick last (which should be the best checkpoint). last_checkpoint = sorted( checkpoints, key=lambda x: int(re.match(r".+checkpoint-(\d+)", x).group(1)))[-1] assert re.match(r"^.+checkpoint_\d+/checkpoint-\d+$", last_checkpoint) if not os.path.exists(last_checkpoint): sys.exit(1) print("Best checkpoint={} (exists)".format(last_checkpoint)) ray.shutdown() # Test rolling out n steps. result = os.popen( "python {}/evaluate.py --run={} " "--steps=400 " "--out=\"{}/rollouts_n_steps.pkl\" --no-render \"{}\"".format( rllib_dir, algo, tmp_dir, last_checkpoint)).read()[:-1] if not os.path.exists(tmp_dir + "/rollouts_n_steps.pkl"): sys.exit(1) print("Rollout output exists -> Checking reward ...") episodes = result.split("\n") mean_reward = 0.0 num_episodes = 0 for ep in episodes: mo = re.match(r"Episode .+reward: ([\d\.\-]+)", ep) if mo: mean_reward += float(mo.group(1)) num_episodes += 1 mean_reward /= num_episodes print("Rollout's mean episode reward={}".format(mean_reward)) assert mean_reward >= 100.0 # Cleanup. os.popen("rm -rf \"{}\"".format(tmp_dir)).read()
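# Illustration only (standalone snippet): the regex loop above that averages the
# per-episode rewards printed by evaluate.py, run here on made-up output lines.
# Only the "Episode ... reward: <num>" pattern is assumed, exactly as in the
# test; the wording of real evaluate.py output may differ.
import re

fake_output = """Episode #1: reward: 98.0
Episode #2: reward: 112.5
Episode #3: reward: 105.0"""

mean_reward, num_episodes = 0.0, 0
for line in fake_output.split("\n"):
    mo = re.match(r"Episode .+reward: ([\d\.\-]+)", line)
    if mo:
        mean_reward += float(mo.group(1))
        num_episodes += 1
mean_reward /= num_episodes
print(mean_reward)  # -> 105.1666...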
def test_dqn_exploration_and_soft_q_config(self): """Tests, whether a DQN Agent outputs exploration/softmaxed actions.""" config = dqn.DEFAULT_CONFIG.copy() config["num_workers"] = 0 # Run locally. config["env_config"] = {"is_slippery": False, "map_name": "4x4"} obs = np.array(0) # Test against all frameworks. for _ in framework_iterator(config): # Default EpsilonGreedy setup. trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0") # Setting explore=False should always return the same action. a_ = trainer.compute_single_action(obs, explore=False) for _ in range(50): a = trainer.compute_single_action(obs, explore=False) check(a, a_) # explore=None (default: explore) should return different actions. actions = [] for _ in range(50): actions.append(trainer.compute_single_action(obs)) check(np.std(actions), 0.0, false=True) trainer.stop() # Low softmax temperature. Behaves like argmax # (but no epsilon exploration). config["exploration_config"] = { "type": "SoftQ", "temperature": 0.000001 } trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0") # Due to the low temp, always expect the same action. actions = [trainer.compute_single_action(obs)] for _ in range(50): actions.append(trainer.compute_single_action(obs)) check(np.std(actions), 0.0, decimals=3) trainer.stop() # Higher softmax temperature. config["exploration_config"]["temperature"] = 1.0 trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0") # Even with the higher temperature, if we set explore=False, we # should expect the same actions always. a_ = trainer.compute_single_action(obs, explore=False) for _ in range(50): a = trainer.compute_single_action(obs, explore=False) check(a, a_) # Due to the higher temp, expect different actions avg'ing # around 1.5. actions = [] for _ in range(300): actions.append(trainer.compute_single_action(obs)) check(np.std(actions), 0.0, false=True) trainer.stop() # With Random exploration. config["exploration_config"] = {"type": "Random"} config["explore"] = True trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0") actions = [] for _ in range(300): actions.append(trainer.compute_single_action(obs)) check(np.std(actions), 0.0, false=True) trainer.stop()
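# Illustration only (standalone snippet, not RLlib's SoftQ exploration class):
# temperature-controlled softmax sampling over Q-values, the behavior the two
# SoftQ configurations above probe: a tiny temperature collapses to argmax,
# while temperature 1.0 keeps the sampled actions spread out.
import numpy as np

def soft_q_sample(q_values, temperature, n=1000, seed=0):
    rng = np.random.default_rng(seed)
    logits = q_values / temperature
    probs = np.exp(logits - logits.max())
    probs = probs / probs.sum()
    return rng.choice(len(q_values), size=n, p=probs)

q = np.array([0.1, 0.5, 0.2, 0.05])
print(np.std(soft_q_sample(q, temperature=1e-6)))  # ~0.0: always argmax.
print(np.std(soft_q_sample(q, temperature=1.0)))   # > 0.0: still explores.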