Example #1
    def test_dqn_compilation(self):
        """Test whether DQN can be built on all frameworks."""
        num_iterations = 1
        config = dqn.dqn.DQNConfig().rollouts(num_rollout_workers=2)

        for _ in framework_iterator(config, with_eager_tracing=True):
            # Double-dueling DQN.
            print("Double-dueling")
            plain_config = deepcopy(config)
            trainer = dqn.DQN(config=plain_config, env="CartPole-v0")
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)

            check_compute_single_action(trainer)
            trainer.stop()

            # Rainbow.
            print("Rainbow")
            rainbow_config = deepcopy(config).training(
                num_atoms=10, noisy=True, double_q=True, dueling=True, n_step=5
            )
            trainer = dqn.DQN(config=rainbow_config, env="CartPole-v0")
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)

            check_compute_single_action(trainer)

            trainer.stop()
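
Outside of the test harness, the same builder calls compose a regular training run. A minimal sketch, assuming the `dqn` module used above is `ray.rllib.algorithms.dqn` (matching the `dqn.DQN` / `dqn.dqn.DQNConfig` usage) and a local Ray session:

import ray
from ray.rllib.algorithms import dqn  # assumed import path for the `dqn` module used above

ray.init()

# Rainbow-style settings, mirroring the second branch of the test above.
config = dqn.dqn.DQNConfig().rollouts(num_rollout_workers=2).training(
    num_atoms=10, noisy=True, double_q=True, dueling=True, n_step=5
)
trainer = dqn.DQN(config=config, env="CartPole-v0")
for _ in range(3):
    print(trainer.train())
trainer.stop()
ray.shutdown()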
Example #2
    def test_on_sub_environment_created(self):
        base_config = {
            "env": "CartPole-v1",
            # Create 4 sub-environments per remote worker.
            "num_envs_per_worker": 4,
            # Create 2 remote workers.
            "num_workers": 2,
        }

        for callbacks in (
            OnSubEnvironmentCreatedCallback,
            MultiCallbacks([OnSubEnvironmentCreatedCallback]),
        ):
            config = dict(base_config, callbacks=callbacks)

            for _ in framework_iterator(config, frameworks=("tf", "torch")):
                trainer = dqn.DQN(config=config)
                # Fake the counter on the local worker (doesn't have an env) and
                # set it to -1 so the below `foreach_worker()` won't fail.
                trainer.workers.local_worker().sum_sub_env_vector_indices = -1

                # Get sub-env vector index sums from the 2 remote workers:
                sum_sub_env_vector_indices = trainer.workers.foreach_worker(
                    lambda w: w.sum_sub_env_vector_indices
                )
                # Local worker has no environments -> Expect the -1 special
                # value returned by the above lambda.
                self.assertTrue(sum_sub_env_vector_indices[0] == -1)
                # Both remote workers (index 1 and 2) have a vector index counter
                # of 6 (sum of vector indices: 0 + 1 + 2 + 3).
                self.assertTrue(sum_sub_env_vector_indices[1] == 6)
                self.assertTrue(sum_sub_env_vector_indices[2] == 6)
                trainer.stop()
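
The `OnSubEnvironmentCreatedCallback` class used above is not part of this snippet. Based on the comments (each remote worker ends up with the sum of its four sub-env vector indices, 0 + 1 + 2 + 3 = 6), one plausible sketch, assuming RLlib's `DefaultCallbacks.on_sub_environment_created` hook and this import path:

from ray.rllib.algorithms.callbacks import DefaultCallbacks  # assumed import path


class OnSubEnvironmentCreatedCallback(DefaultCallbacks):
    def on_sub_environment_created(
        self, *, worker, sub_environment, env_context, **kwargs
    ):
        # Accumulate the vector index of every sub-env created on this worker.
        # With 4 sub-envs per worker this yields 0 + 1 + 2 + 3 = 6, matching
        # the assertions in the test above.
        if not hasattr(worker, "sum_sub_env_vector_indices"):
            worker.sum_sub_env_vector_indices = 0
        worker.sum_sub_env_vector_indices += env_context.vector_index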
Example #3
    def test_leaky_policy(self):
        """Tests whether our diagnostic tools can detect leaks in a policy."""
        config = dqn.DEFAULT_CONFIG.copy()
        # Make sure we have an env to test on the local worker.
        # Otherwise, `check_memory_leaks` will complain.
        config["create_env_on_driver"] = True
        config["env"] = "CartPole-v0"
        config["multiagent"]["policies"] = {
            "default_policy": PolicySpec(policy_class=MemoryLeakingPolicy),
        }
        trainer = dqn.DQN(config=config)
        results = check_memory_leaks(trainer, to_check={"policy"}, repeats=300)
        assert results["policy"]
        trainer.stop()
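
For contrast, running the same check against the default (non-leaking) DQN policy would be expected to report no suspects. A hedged sketch reusing the exact helpers from the example above:

config = dqn.DEFAULT_CONFIG.copy()
config["create_env_on_driver"] = True
config["env"] = "CartPole-v0"
trainer = dqn.DQN(config=config)
results = check_memory_leaks(trainer, to_check={"policy"}, repeats=300)
assert not results.get("policy")  # no leak suspects expected for the default policy
trainer.stop()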
Example #4
def train_rllib_policy(config):
    """Trains a DQN on MsPacman-v0 for n iterations.

    Saves the trained Trainer to disk and returns the checkpoint path.

    Returns:
        str: Path to the saved checkpoint from which the trained DQN can be restored.
    """
    # Create trainer from config.
    trainer = dqn.DQN(config=config)

    # Train for n iterations, then save.
    for _ in range(args.train_iters):
        print(trainer.train())
    return trainer.save()
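
The returned checkpoint path can later be loaded into a fresh trainer (`restore` is the same call used in example #6 below). A hypothetical round trip, assuming the same `config` that was passed to `train_rllib_policy`:

checkpoint_path = train_rllib_policy(config)

# Build a new DQN from the same config and load the saved state.
restored_trainer = dqn.DQN(config=config)
restored_trainer.restore(checkpoint_path)

# The restored trainer can keep training or serve actions.
print(restored_trainer.train())
restored_trainer.stop()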
Example #5
    def test_traj_view_normal_case(self):
        """Tests, whether Model and Policy return the correct ViewRequirements."""
        config = dqn.DEFAULT_CONFIG.copy()
        config["num_envs_per_worker"] = 10
        config["rollout_fragment_length"] = 4

        for _ in framework_iterator(config):
            algo = dqn.DQN(
                config,
                env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv")
            policy = algo.get_policy()
            view_req_model = policy.model.view_requirements
            view_req_policy = policy.view_requirements
            print(_)
            print(view_req_policy)
            print(view_req_model)
            assert len(view_req_model) == 1, view_req_model
            assert len(view_req_policy) == 11, view_req_policy
            for key in [
                    SampleBatch.OBS,
                    SampleBatch.ACTIONS,
                    SampleBatch.REWARDS,
                    SampleBatch.DONES,
                    SampleBatch.NEXT_OBS,
                    SampleBatch.EPS_ID,
                    SampleBatch.AGENT_INDEX,
                    "weights",
            ]:
                assert key in view_req_policy
                # None of the view cols has a special underlying data_col,
                # except next-obs.
                if key != SampleBatch.NEXT_OBS:
                    assert view_req_policy[key].data_col is None
                else:
                    assert view_req_policy[key].data_col == SampleBatch.OBS
                    assert view_req_policy[key].shift == 1
            rollout_worker = algo.workers.local_worker()
            sample_batch = rollout_worker.sample()
            expected_count = (config["num_envs_per_worker"] *
                              config["rollout_fragment_length"])
            assert sample_batch.count == expected_count
            for v in sample_batch.values():
                assert len(v) == expected_count
            algo.stop()
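
The same introspection works outside the test: after building any DQN, the per-column `ViewRequirement` objects can be listed directly from the policy (the `data_col` and `shift` attributes are the ones asserted on above). A short sketch:

algo = dqn.DQN(config=dqn.DEFAULT_CONFIG.copy(), env="CartPole-v0")
policy = algo.get_policy()
for col, view_req in policy.view_requirements.items():
    # e.g. "new_obs" maps to data_col "obs" with shift 1.
    print(col, view_req.data_col, view_req.shift)
algo.stop()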
Example #6
    def __init__(self, config, checkpoint_path):
        # Create the Trainer.
        self.trainer = dqn.DQN(config=config)
        # Load an already trained state for the trainer.
        self.trainer.restore(checkpoint_path)
Example #7
def main():
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    env_config = {
        "num_candidates": args.env_num_candidates,
        "resample_documents": not args.env_dont_resample_documents,
        "slate_size": args.env_slate_size,
        "seed": args.env_seed,
        "convert_to_discrete_action_space": args.run == "DQN",
    }

    config = {
        "env": (
            InterestEvolutionRecSimEnv
            if args.env == "interest-evolution"
            else InterestExplorationRecSimEnv
            if args.env == "interest-exploration"
            else LongTermSatisfactionRecSimEnv
        ),
        "framework": args.framework,
        "num_gpus": args.num_gpus,
        "num_workers": args.num_workers,
        "env_config": env_config,
        "replay_buffer_config": {
            "learning_starts": args.learning_starts,
        },
    }

    # Perform a test run on the env with a random agent to see what
    # the random baseline reward is.
    if args.random_test_episodes:
        print(f"Running {args.random_test_episodes} episodes to get a random "
              "agent's baseline reward ...")
        env = config["env"](config=env_config)
        env.reset()
        num_episodes = 0
        episode_rewards = []
        episode_reward = 0.0
        while num_episodes < args.random_test_episodes:
            action = env.action_space.sample()
            _, r, d, _ = env.step(action)
            episode_reward += r
            if d:
                num_episodes += 1
                episode_rewards.append(episode_reward)
                episode_reward = 0.0
                env.reset()
        print(f"Ran {args.random_test_episodes} episodes with a random agent "
              "reaching a mean episode return of "
              f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}.")

    if args.use_tune:
        stop = {
            "training_iteration": args.stop_iters,
            "timesteps_total": args.stop_timesteps,
            "episode_reward_mean": args.stop_reward,
        }

        results = tune.run(
            args.run,
            stop=stop,
            config=config,
            num_samples=args.tune_num_samples,
            verbose=2,
        )

        if args.as_test:
            check_learning_achieved(results, args.stop_reward)

    else:
        # Directly run using the trainer interface (good for debugging).
        if args.run == "DQN":
            trainer = dqn.DQN(config=config)
        else:
            trainer = slateq.SlateQTrainer(config=config)
        for i in range(10):
            result = trainer.train()
            print(pretty_print(result))
    ray.shutdown()
Example #8
    def test_dqn_exploration_and_soft_q_config(self):
        """Tests, whether a DQN Agent outputs exploration/softmaxed actions."""
        config = (
            dqn.dqn.DQNConfig()
            .rollouts(num_rollout_workers=0)
            .environment(env_config={"is_slippery": False, "map_name": "4x4"})
        )
        obs = np.array(0)

        # Test against all frameworks.
        for _ in framework_iterator(config):
            # Default EpsilonGreedy setup.
            trainer = dqn.DQN(config=config, env="FrozenLake-v1")
            # Setting explore=False should always return the same action.
            a_ = trainer.compute_single_action(obs, explore=False)
            for _ in range(50):
                a = trainer.compute_single_action(obs, explore=False)
                check(a, a_)
            # explore=None (default: explore) should return different actions.
            actions = []
            for _ in range(50):
                actions.append(trainer.compute_single_action(obs))
            check(np.std(actions), 0.0, false=True)
            trainer.stop()

            # Low softmax temperature. Behaves like argmax
            # (but no epsilon exploration).
            config.exploration(
                exploration_config={"type": "SoftQ", "temperature": 0.000001}
            )
            trainer = dqn.DQN(config=config, env="FrozenLake-v1")
            # Due to the low temp, always expect the same action.
            actions = [trainer.compute_single_action(obs)]
            for _ in range(50):
                actions.append(trainer.compute_single_action(obs))
            check(np.std(actions), 0.0, decimals=3)
            trainer.stop()

            # Higher softmax temperature.
            config.exploration_config["temperature"] = 1.0
            trainer = dqn.DQN(config=config, env="FrozenLake-v1")

            # Even with the higher temperature, if we set explore=False, we
            # should expect the same actions always.
            a_ = trainer.compute_single_action(obs, explore=False)
            for _ in range(50):
                a = trainer.compute_single_action(obs, explore=False)
                check(a, a_)

            # Due to the higher temperature, expect different actions, averaging
            # around 1.5.
            actions = []
            for _ in range(300):
                actions.append(trainer.compute_single_action(obs))
            check(np.std(actions), 0.0, false=True)
            trainer.stop()

            # With Random exploration.
            config.exploration(exploration_config={"type": "Random"}, explore=True)
            trainer = dqn.DQN(config=config, env="FrozenLake-v1")
            actions = []
            for _ in range(300):
                actions.append(trainer.compute_single_action(obs))
            check(np.std(actions), 0.0, false=True)
            trainer.stop()
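
Outside of the test, the same exploration settings go through the config builder. A minimal sketch mirroring the SoftQ branch above (the temperature value is only illustrative):

config = (
    dqn.dqn.DQNConfig()
    .rollouts(num_rollout_workers=0)
    .exploration(exploration_config={"type": "SoftQ", "temperature": 1.0})
)
trainer = dqn.DQN(config=config, env="FrozenLake-v1")

obs = np.array(0)
# Deterministic (greedy) action:
print(trainer.compute_single_action(obs, explore=False))
# Stochastic, softmax-sampled action:
print(trainer.compute_single_action(obs))
trainer.stop()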