def test_polynomial_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23, 1000]
    expected = [
        0.5 + (2.0 - 0.5) * (1.0 - min(t, 100) / 100)**2 for t in ts
    ]
    config = dict(
        type="ray.rllib.utils.schedules.polynomial_schedule."
        "PolynomialSchedule",
        schedule_timesteps=100,
        initial_p=2.0,
        final_p=0.5,
        power=2.0,
    )
    for fw in framework_iterator(
            frameworks=["tf2", "tf", "tfe", "torch", None]):
        polynomial = from_config(config, framework=fw)
        for t, e in zip(ts, expected):
            out = polynomial(t)
            check(out, e, decimals=4)

        ts_as_tensors = self._get_framework_tensors(ts, fw)
        for t, e in zip(ts_as_tensors, expected):
            out = polynomial(t)
            assert fw != "tf" or isinstance(out, tf.Tensor)
            check(out, e, decimals=4)
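# NOTE: `self._get_framework_tensors` is used by several tests here but is not
# defined in this excerpt. The following is a minimal, hypothetical sketch of
# such a helper, assuming it only wraps the int timesteps in framework-specific
# tensors (the real helper in the test file may differ):
def _get_framework_tensors(self, ts, fw):
    if fw in ["tf", "tf2", "tfe"]:
        import tensorflow as tf
        return [tf.constant(t, dtype=tf.int32) for t in ts]
    elif fw == "torch":
        import torch
        return [torch.tensor(t, dtype=torch.int32) for t in ts]
    # framework=None: leave the plain python ints as-is.
    return ts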
def test_constant_schedule(self):
    value = 2.3
    ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]
    for fw in ["tf", "torch", None]:
        constant = from_config(
            ConstantSchedule, dict(value=value, framework=fw))
        for t in ts:
            out = constant(t)
            check(out, value)
def test_linear_schedule(self):
    ts = [0, 50, 10, 100, 90, 2, 1, 99, 23, 1000]
    config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6}
    for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
        fw_ = fw if fw != "tfe" else "tf"
        linear = from_config(LinearSchedule, config, framework=fw_)
        for t in ts:
            out = linear(t)
            check(out, 2.1 - (min(t, 100) / 100) * (2.1 - 0.6), decimals=4)
def test_piecewise_schedule(self):
    piecewise = from_config(
        PiecewiseSchedule,
        dict(
            endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
            outside_value=14.5))
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
    expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
    for t, e in zip(ts, expected):
        out = piecewise(t)
        check(out, e, decimals=4)
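# The `expected` values used by the piecewise tests come from linear
# interpolation between neighboring endpoints (and `outside_value` once t is
# past the last endpoint). A minimal reference sketch (not RLlib's
# PiecewiseSchedule implementation) showing where the numbers come from:
def _piecewise_expected(t, endpoints, outside_value):
    for (left_t, left_v), (right_t, right_v) in zip(endpoints[:-1],
                                                    endpoints[1:]):
        if left_t <= t < right_t:
            alpha = (t - left_t) / (right_t - left_t)
            return left_v + alpha * (right_v - left_v)
    return outside_value

# E.g. t=5 lies between (0, 50.0) and (25, 100.0):
# 50.0 + 5 / 25 * 50.0 = 60.0; t=100 is past (30, 200.0) -> 14.5.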
def test_exponential_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
    config = dict(initial_p=2.0, decay_rate=0.99, schedule_timesteps=100)
    for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
        fw_ = fw if fw != "tfe" else "tf"
        exponential = from_config(
            ExponentialSchedule, config, framework=fw_)
        for t in ts:
            out = exponential(t)
            check(out, 2.0 * 0.99**(t / 100), decimals=4)
def test_piecewise_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
    expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
    config = dict(
        endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
        outside_value=14.5)
    for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
        fw_ = fw if fw != "tfe" else "tf"
        piecewise = from_config(PiecewiseSchedule, config, framework=fw_)
        for t, e in zip(ts, expected):
            out = piecewise(t)
            check(out, e, decimals=4)
def test_constant_schedule(self):
    value = 2.3
    ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]
    config = {"value": value}
    for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
        fw_ = fw if fw != "tfe" else "tf"
        constant = from_config(ConstantSchedule, config, framework=fw_)
        for t in ts:
            out = constant(t)
            check(out, value)
def test_exponential_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
    for fw in ["tf", "torch", None]:
        exponential = from_config(
            ExponentialSchedule,
            dict(
                initial_p=2.0,
                decay_rate=0.99,
                schedule_timesteps=100,
                framework=fw))
        for t in ts:
            out = exponential(t)
            check(out, 2.0 * 0.99**(t / 100), decimals=4)
def test_linear_schedule(self):
    ts = [0, 50, 10, 100, 90, 2, 1, 99, 23]
    for fw in ["tf", "torch", None]:
        linear = from_config(
            LinearSchedule, {
                "schedule_timesteps": 100,
                "initial_p": 2.1,
                "final_p": 0.6,
                "framework": fw
            })
        if fw == "tf":
            tf.enable_eager_execution()
        for t in ts:
            out = linear(t)
            check(out, 2.1 - (t / 100) * (2.1 - 0.6), decimals=4)
def test_linear_schedule(self):
    ts = [0, 50, 10, 100, 90, 2, 1, 99, 23]
    config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6}
    for fw in ["tf", "torch", None]:
        linear = from_config(LinearSchedule, config, framework=fw)
        for t in ts:
            out = linear(t)
            check(out, 2.1 - (t / 100) * (2.1 - 0.6), decimals=4)

    # Test eager as well.
    with eager_mode():
        linear = from_config(LinearSchedule, config, framework="tf")
        for t in ts:
            out = linear(t)
            check(out, 2.1 - (t / 100) * (2.1 - 0.6), decimals=4)
def test_polynomial_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23, 1000]
    config = dict(
        type="ray.rllib.utils.schedules.polynomial_schedule."
        "PolynomialSchedule",
        schedule_timesteps=100,
        initial_p=2.0,
        final_p=0.5,
        power=2.0)
    for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
        fw_ = fw if fw != "tfe" else "tf"
        polynomial = from_config(config, framework=fw_)
        for t in ts:
            out = polynomial(t)
            t = min(t, 100)
            check(out, 0.5 + (2.0 - 0.5) * (1.0 - t / 100)**2, decimals=4)
def test_polynomial_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
    for fw in ["tf", "torch", None]:
        polynomial = from_config(
            dict(
                type="ray.rllib.utils.schedules.polynomial_schedule."
                "PolynomialSchedule",
                schedule_timesteps=100,
                initial_p=2.0,
                final_p=0.5,
                power=2.0,
                framework=fw))
        if fw == "tf":
            tf.enable_eager_execution()
        for t in ts:
            out = polynomial(t)
            check(out, 0.5 + (2.0 - 0.5) * (1.0 - t / 100)**2, decimals=4)
def test_linear_schedule(self):
    ts = [0, 50, 10, 100, 90, 2, 1, 99, 23, 1000]
    expected = [2.1 - (min(t, 100) / 100) * (2.1 - 0.6) for t in ts]
    config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6}
    for fw in framework_iterator(
            frameworks=["tf2", "tf", "tfe", "torch", None]):
        linear = from_config(LinearSchedule, config, framework=fw)
        for t, e in zip(ts, expected):
            out = linear(t)
            check(out, e, decimals=4)

        ts_as_tensors = self._get_framework_tensors(ts, fw)
        for t, e in zip(ts_as_tensors, expected):
            out = linear(t)
            assert fw != "tf" or isinstance(out, tf.Tensor)
            check(out, e, decimals=4)
def test_exponential_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
    config = dict(initial_p=2.0, decay_rate=0.99, schedule_timesteps=100)
    for fw in ["tf", "torch", None]:
        config["framework"] = fw
        exponential = from_config(ExponentialSchedule, config)
        for t in ts:
            out = exponential(t)
            check(out, 2.0 * 0.99**(t / 100), decimals=4)

    # Test eager as well.
    with eager_mode():
        config["framework"] = "tf"
        exponential = from_config(ExponentialSchedule, config)
        for t in ts:
            out = exponential(t)
            check(out, 2.0 * 0.99**(t / 100), decimals=4)
def test_constant_schedule(self):
    value = 2.3
    ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]
    config = {"value": value}
    for fw in ["tf", "torch", None]:
        constant = from_config(ConstantSchedule, config, framework=fw)
        for t in ts:
            out = constant(t)
            check(out, value)

    # Test eager as well.
    with eager_mode():
        constant = from_config(ConstantSchedule, config, framework="tf")
        for t in ts:
            out = constant(t)
            check(out, value)
def test_constant_schedule(self):
    value = 2.3
    ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]
    config = {"value": value}
    for fw in framework_iterator(
            frameworks=["tf2", "tf", "tfe", "torch", None]):
        constant = from_config(ConstantSchedule, config, framework=fw)
        for t in ts:
            out = constant(t)
            check(out, value)

        ts_as_tensors = self._get_framework_tensors(ts, fw)
        for t in ts_as_tensors:
            out = constant(t)
            assert fw != "tf" or isinstance(out, tf.Tensor)
            check(out, value, decimals=4)
def test_piecewise_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
    expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
    config = dict(
        endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
        outside_value=14.5)
    for fw in framework_iterator(
            frameworks=["tf2", "tf", "tfe", "torch", None]):
        piecewise = from_config(PiecewiseSchedule, config, framework=fw)
        for t, e in zip(ts, expected):
            out = piecewise(t)
            check(out, e, decimals=4)

        ts_as_tensors = self._get_framework_tensors(ts, fw)
        for t, e in zip(ts_as_tensors, expected):
            out = piecewise(t)
            assert fw != "tf" or isinstance(out, tf.Tensor)
            check(out, e, decimals=4)
def test_piecewise_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
    expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
    config = dict(
        endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
        outside_value=14.5)
    for fw in ["tf", "torch", None]:
        config["framework"] = fw
        piecewise = from_config(PiecewiseSchedule, config)
        for t, e in zip(ts, expected):
            out = piecewise(t)
            check(out, e, decimals=4)

    # Test eager as well.
    with eager_mode():
        config["framework"] = "tf"
        piecewise = from_config(PiecewiseSchedule, config)
        for t, e in zip(ts, expected):
            out = piecewise(t)
            check(out, e, decimals=4)
def do_test_explorations(
        run, env, config, dummy_obs, prev_a=None, expected_mean_action=None):
    """Calls an Agent's `compute_actions` with different `explore` options."""
    core_config = config.copy()
    if run not in [a3c.A3CTrainer]:
        core_config["num_workers"] = 0

    # Test all frameworks.
    for _ in framework_iterator(core_config):
        print("Agent={}".format(run))

        # Test for both the default Agent's exploration AND the `Random`
        # exploration class.
        for exploration in [None, "Random"]:
            local_config = core_config.copy()
            if exploration == "Random":
                # TODO(sven): Random doesn't work for IMPALA yet.
                if run is impala.ImpalaTrainer:
                    continue
                local_config["exploration_config"] = {"type": "Random"}
            print("exploration={}".format(exploration or "default"))

            trainer = run(config=local_config, env=env)

            # Make sure all actions drawn are the same, given same
            # observations.
            actions = []
            for _ in range(25):
                actions.append(
                    trainer.compute_single_action(
                        observation=dummy_obs,
                        explore=False,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None,
                    ))
                check(actions[-1], actions[0])

            # Make sure actions drawn are different
            # (around some mean value), given constant observations.
            actions = []
            for _ in range(500):
                actions.append(
                    trainer.compute_single_action(
                        observation=dummy_obs,
                        explore=True,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None,
                    ))
            check(
                np.mean(actions),
                expected_mean_action
                if expected_mean_action is not None else 0.5,
                atol=0.4)
            # Check that the stddev is not 0.0 (values differ).
            check(np.std(actions), 0.0, false=True)
def test_polynomial_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
    config = dict(
        type="ray.rllib.utils.schedules.polynomial_schedule."
        "PolynomialSchedule",
        schedule_timesteps=100,
        initial_p=2.0,
        final_p=0.5,
        power=2.0)
    for fw in ["tf", "torch", None]:
        config["framework"] = fw
        polynomial = from_config(config)
        for t in ts:
            out = polynomial(t)
            check(out, 0.5 + (2.0 - 0.5) * (1.0 - t / 100)**2, decimals=4)

    # Test eager as well.
    with eager_mode():
        config["framework"] = "tf"
        polynomial = from_config(config)
        for t in ts:
            out = polynomial(t)
            check(out, 0.5 + (2.0 - 0.5) * (1.0 - t / 100)**2, decimals=4)
def test_explorations(run,
                      env,
                      config,
                      dummy_obs,
                      prev_a=None,
                      expected_mean_action=None):
    """Calls an Agent's `compute_actions` with different `explore` options."""
    config = config.copy()
    if run not in [a3c.A3CTrainer]:
        config["num_workers"] = 0

    # Test all frameworks.
    for fw in ["torch", "eager", "tf"]:
        if fw == "torch" and \
                run in [dqn.DQNTrainer, dqn.SimpleQTrainer,
                        impala.ImpalaTrainer, sac.SACTrainer]:
            continue
        print("Testing {} in framework={}".format(run, fw))
        config["eager"] = True if fw == "eager" else False
        config["use_pytorch"] = True if fw == "torch" else False

        # Test for both the default Agent's exploration AND the `Random`
        # exploration class.
        for exploration in [None]:  # , "Random"]:
            if exploration == "Random":
                config["exploration_config"] = {"type": "Random"}

            trainer = run(config=config, env=env)

            # Make sure all actions drawn are the same, given same
            # observations.
            actions = []
            for _ in range(100):
                actions.append(
                    trainer.compute_action(
                        observation=dummy_obs,
                        explore=False,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None))
                check(actions[-1], actions[0])

            # Make sure actions drawn are different (around some mean value),
            # given constant observations.
            actions = []
            for _ in range(100):
                actions.append(
                    trainer.compute_action(
                        observation=dummy_obs,
                        explore=True,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None))
            check(
                np.mean(actions),
                expected_mean_action
                if expected_mean_action is not None else 0.5,
                atol=0.3)
            # Check that the stddev is not 0.0 (values differ).
            check(np.std(actions), 0.0, false=True)
def test_exponential_schedule(self):
    decay_rate = 0.2
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
    expected = [2.0 * decay_rate**(t / 100) for t in ts]
    config = dict(
        initial_p=2.0, decay_rate=decay_rate, schedule_timesteps=100)
    for fw in framework_iterator(
            frameworks=["tf2", "tf", "tfe", "torch", None]):
        exponential = from_config(
            ExponentialSchedule, config, framework=fw)
        for t, e in zip(ts, expected):
            out = exponential(t)
            check(out, e, decimals=4)

        ts_as_tensors = self._get_framework_tensors(ts, fw)
        for t, e in zip(ts_as_tensors, expected):
            out = exponential(t)
            assert fw != "tf" or isinstance(out, tf.Tensor)
            check(out, e, decimals=4)
def test_no_curiosity(self):
    # Copy the default config so the module-level dict is not mutated.
    config = ppo.DEFAULT_CONFIG.copy()
    env = "CartPole-v0"
    dummy_obs = np.array([0.0, 0.1, 0.0, 0.0])
    prev_a = np.array(0)

    config["framework"] = "torch"
    config["exploration_config"] = {"type": "ParameterNoise"}
    trainer = ppo.PPOTrainer(config=config, env=env)
    trainer.train()

    # Make sure all actions drawn are the same, given same
    # observations. Tests the explorations API.
    actions = []
    for _ in range(5):
        actions.append(
            trainer.compute_action(
                observation=dummy_obs,
                explore=False,
                prev_action=prev_a,
                prev_reward=1.0 if prev_a is not None else None))
        check(actions[-1], actions[0])
    print(actions)
def test_pg_loss_functions(self):
    """Tests the PG loss function math."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["eager"] = True
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = {
        SampleBatch.CUR_OBS: np.array([[0.1, 0.2, 0.3, 0.4],
                                       [0.5, 0.6, 0.7, 0.8],
                                       [0.9, 1.0, 1.1, 1.2]]),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
        SampleBatch.DONES: np.array([False, False, True])
    }

    # tf.
    trainer = pg.PGTrainer(config=config, env="CartPole-v0")
    policy = trainer.get_policy()
    vars = policy.model.trainable_variables()

    # Post-process (calculate simple (non-GAE) advantages) and attach to
    # train_batch dict.
    # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
    # [2.9701, 1.99, 1.0]
    train_batch = pg.post_process_advantages(policy, train_batch)
    # Check Advantage values.
    check(train_batch[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

    # Actual loss results.
    results = pg.pg_tf_loss(
        policy,
        policy.model,
        dist_class=Categorical,
        train_batch=train_batch)

    # Calculate expected results.
    expected_logits = fc(
        fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
           vars[1].numpy()), vars[2].numpy(), vars[3].numpy())
    expected_logp = Categorical(expected_logits, policy.model).logp(
        train_batch[SampleBatch.ACTIONS])
    expected_loss = -np.mean(
        expected_logp * train_batch[Postprocessing.ADVANTAGES])
    check(results.numpy(), expected_loss, decimals=4)

    # Torch.
    config["use_pytorch"] = True
    trainer = pg.PGTrainer(config=config, env="CartPole-v0")
    policy = trainer.get_policy()
    train_batch = policy._lazy_tensor_dict(train_batch)
    results = pg.pg_torch_loss(
        policy,
        policy.model,
        dist_class=TorchCategorical,
        train_batch=train_batch)
    expected_logits = policy.model.last_output()
    expected_logp = TorchCategorical(expected_logits, policy.model).logp(
        train_batch[SampleBatch.ACTIONS])
    expected_loss = -np.mean(
        expected_logp.detach().numpy() *
        train_batch[Postprocessing.ADVANTAGES].numpy())
    check(results.detach().numpy(), expected_loss, decimals=4)
def do_test_explorations(run,
                         env,
                         config,
                         dummy_obs,
                         prev_a=None,
                         expected_mean_action=None):
    """Calls an Agent's `compute_actions` with different `explore` options."""
    config = config.copy()
    if run not in [a3c.A3CTrainer]:
        config["num_workers"] = 0

    # Test all frameworks.
    for fw in ["tf", "eager", "torch"]:
        if fw == "torch" and \
                run in [ddpg.DDPGTrainer, dqn.DQNTrainer, dqn.SimpleQTrainer,
                        impala.ImpalaTrainer, sac.SACTrainer, td3.TD3Trainer]:
            continue
        elif fw == "eager" and run in [ddpg.DDPGTrainer, td3.TD3Trainer]:
            continue
        print("Testing {} in framework={}".format(run, fw))
        config["eager"] = fw == "eager"
        config["use_pytorch"] = fw == "torch"

        # Test for both the default Agent's exploration AND the `Random`
        # exploration class.
        for exploration in [None, "Random"]:
            if exploration == "Random":
                # TODO(sven): Random doesn't work for IMPALA yet.
                if run is impala.ImpalaTrainer:
                    continue
                config["exploration_config"] = {"type": "Random"}
            print("exploration={}".format(exploration or "default"))

            eager_ctx = None
            if fw == "eager":
                eager_ctx = eager_mode()
                eager_ctx.__enter__()
                assert tf.executing_eagerly()
            elif fw == "tf":
                assert not tf.executing_eagerly()

            trainer = run(config=config, env=env)

            # Make sure all actions drawn are the same, given same
            # observations.
            actions = []
            for _ in range(50):
                actions.append(
                    trainer.compute_action(
                        observation=dummy_obs,
                        explore=False,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None))
                check(actions[-1], actions[0])

            # Make sure actions drawn are different
            # (around some mean value), given constant observations.
            actions = []
            for _ in range(100):
                actions.append(
                    trainer.compute_action(
                        observation=dummy_obs,
                        explore=True,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None))
            check(
                np.mean(actions),
                expected_mean_action
                if expected_mean_action is not None else 0.5,
                atol=0.3)
            # Check that the stddev is not 0.0 (values differ).
            check(np.std(actions), 0.0, false=True)

            if eager_ctx:
                eager_ctx.__exit__(None, None, None)
def test_pg_loss_functions(self):
    """Tests the PG loss function math."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = SampleBatch({
        SampleBatch.OBS: np.array([[0.1, 0.2, 0.3, 0.4],
                                   [0.5, 0.6, 0.7, 0.8],
                                   [0.9, 1.0, 1.1, 1.2]]),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.EPS_ID: np.array([1234, 1234, 1234]),
        SampleBatch.AGENT_INDEX: np.array([0, 0, 0]),
    })

    for fw, sess in framework_iterator(config, session=True):
        dist_cls = (Categorical if fw != "torch" else TorchCategorical)
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()
        if sess:
            vars = policy.get_session().run(vars)

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch_ = pg.post_process_advantages(policy,
                                                  train_batch.copy())
        if fw == "torch":
            train_batch_ = policy._lazy_tensor_dict(train_batch_)

        # Check Advantage values.
        check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        if sess:
            results = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch_, shuffle=False))
        else:
            results = (pg.pg_tf_loss
                       if fw in ["tf2", "tfe"] else pg.pg_torch_loss)(
                           policy,
                           policy.model,
                           dist_class=dist_cls,
                           train_batch=train_batch_)

        # Calculate expected results.
        if fw != "torch":
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS],
                   vars[0],
                   vars[1],
                   framework=fw),
                vars[2],
                vars[3],
                framework=fw)
        else:
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS],
                   vars[2],
                   vars[3],
                   framework=fw),
                vars[0],
                vars[1],
                framework=fw)
        expected_logp = dist_cls(expected_logits, policy.model).logp(
            train_batch_[SampleBatch.ACTIONS])
        adv = train_batch_[Postprocessing.ADVANTAGES]
        if sess:
            expected_logp = sess.run(expected_logp)
        elif fw == "torch":
            expected_logp = expected_logp.detach().cpu().numpy()
            adv = adv.detach().cpu().numpy()
        else:
            expected_logp = expected_logp.numpy()
        expected_loss = -np.mean(expected_logp * adv)
        check(results, expected_loss, decimals=4)
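# The advantages checked above ([2.9701, 1.99, 1.0]) are simple (non-GAE)
# discounted returns-to-go with gamma=0.99. A minimal sketch (not RLlib's
# `post_process_advantages`) reproducing those numbers:
import numpy as np


def _returns_to_go(rewards, gamma=0.99):
    # Accumulate rewards backwards: R_t = r_t + gamma * R_{t+1}.
    out = np.zeros(len(rewards))
    running = 0.0
    for i in reversed(range(len(rewards))):
        running = rewards[i] + gamma * running
        out[i] = running
    return out


# _returns_to_go(np.array([1.0, 1.0, 1.0])) -> [2.9701, 1.99, 1.0]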