Пример #1
0
    def test_dqn_compilation(self):
        """Test whether a DQNTrainer can be built with both frameworks.

        For each framework yielded by ``framework_iterator`` ("tf" and
        "eager"), a Rainbow-configured trainer and a default-configured
        trainer are built on CartPole-v0 and trained for a few iterations.
        Each trainer is stopped once it is no longer needed.
        """
        config = dqn.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        num_iterations = 2

        for _ in framework_iterator(config, frameworks=["tf", "eager"]):
            # Rainbow.
            rainbow_config = config.copy()
            rainbow_config["num_atoms"] = 10
            rainbow_config["noisy"] = True
            rainbow_config["double_q"] = True
            rainbow_config["dueling"] = True
            rainbow_config["n_step"] = 5
            trainer = dqn.DQNTrainer(config=rainbow_config, env="CartPole-v0")
            for _ in range(num_iterations):
                results = trainer.train()
                print(results)
            # Release this trainer before the next one is built.
            trainer.stop()

            # double-dueling DQN.
            plain_config = config.copy()
            trainer = dqn.DQNTrainer(config=plain_config, env="CartPole-v0")
            for _ in range(num_iterations):
                results = trainer.train()
                print(results)
            trainer.stop()
Пример #2
0
    def test_dqn_compilation(self):
        """Test whether a DQNTrainer can be built on all frameworks.

        Two variants are exercised per framework, in this order: the base
        config ("Double-dueling") and a Rainbow config. Each variant is
        trained, checked with ``check_compute_single_action`` and stopped.
        """
        base_config = dqn.DEFAULT_CONFIG.copy()
        base_config["num_workers"] = 2
        num_iterations = 1

        # (printed label, overrides applied on top of the base config).
        variants = (
            ("Double-dueling", {}),
            ("Rainbow", {
                "num_atoms": 10,
                "noisy": True,
                "double_q": True,
                "dueling": True,
                "n_step": 5,
            }),
        )

        for _ in framework_iterator(base_config):
            for label, overrides in variants:
                print(label)
                variant_config = base_config.copy()
                variant_config.update(overrides)
                trainer = dqn.DQNTrainer(
                    config=variant_config, env="CartPole-v0")
                for _step in range(num_iterations):
                    print(trainer.train())

                check_compute_single_action(trainer)
                trainer.stop()
Пример #3
0
    def test_dqn_compilation(self):
        """Test whether a DQNTrainer can be built on all frameworks.

        For every framework a default-configured trainer is built and
        trained on CartPole-v0. A Rainbow-configured trainer follows, except
        on torch, where it is skipped. Every trainer is stopped after use.
        """
        config = dqn.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        num_iterations = 2

        for fw in framework_iterator(config):
            # double-dueling DQN.
            plain_config = config.copy()
            trainer = dqn.DQNTrainer(config=plain_config, env="CartPole-v0")
            for _ in range(num_iterations):
                results = trainer.train()
                print(results)
            # Stop before the torch early-exit below so no trainer is left
            # running for any framework.
            trainer.stop()

            # Rainbow.
            # TODO(sven): Add torch once DQN-torch supports distributional-Q.
            if fw == "torch":
                continue
            rainbow_config = config.copy()
            rainbow_config["num_atoms"] = 10
            rainbow_config["noisy"] = True
            rainbow_config["double_q"] = True
            rainbow_config["dueling"] = True
            rainbow_config["n_step"] = 5
            trainer = dqn.DQNTrainer(config=rainbow_config, env="CartPole-v0")
            for _ in range(num_iterations):
                results = trainer.train()
                print(results)
            trainer.stop()
Пример #4
0
    def test_dqn_compilation(self):
        """Test whether a DQNTrainer can be built on all frameworks.

        Runs with eager tracing enabled in ``framework_iterator``. Per
        framework, the base config and a Rainbow config are each built,
        trained, validated and stopped.
        """
        num_iterations = 1
        config = dqn.dqn.DQNConfig().rollouts(num_rollout_workers=2)

        def train_check_and_stop(trainer_config):
            # Shared sequence for both variants: build, train, validate the
            # results, check single-action computation, then stop.
            trainer = dqn.DQNTrainer(config=trainer_config, env="CartPole-v0")
            for _step in range(num_iterations):
                train_results = trainer.train()
                check_train_results(train_results)
                print(train_results)
            check_compute_single_action(trainer)
            trainer.stop()

        for _ in framework_iterator(config, with_eager_tracing=True):
            # Double-dueling DQN.
            print("Double-dueling")
            train_check_and_stop(deepcopy(config))

            # Rainbow.
            print("Rainbow")
            train_check_and_stop(
                deepcopy(config).training(
                    num_atoms=10,
                    noisy=True,
                    double_q=True,
                    dueling=True,
                    n_step=5,
                ))
Пример #5
0
def main(params):
    """Train a DQN or PPO trainer as selected by ``params``.

    Args:
        params (dict): Run settings. Keys read here: "use_gym_env",
            "env_name", "model", and, depending on the model, "framework",
            "train" and "num_training_iters".
    """
    for name, setting in params.items():
        print("Parameter {} is set to {}".format(name, setting))

    # Non-gym environments have to be registered under their name first.
    if not params["use_gym_env"]:
        env_name = params["env_name"]
        register_env(env_name, get_env_creator(env_name))

    model_name = params["model"]
    if model_name == "DQN":
        # The bare numbers printed below are progress markers.
        print(3)
        from ray.rllib.agents import dqn
        ray.init()
        print(4)

        dqn_config = dqn.DEFAULT_CONFIG.copy()
        dqn_config["framework"] = params["framework"]
        env = str(params["env_name"])
        print(5)

        trainer = dqn.DQNTrainer(config=dqn_config, env=env)
        print(6)
        for _ in range(100):
            print(trainer.train()['episode_reward_mean'])
    elif model_name == "PPO":
        ray.init()
        ModelCatalog.register_custom_model("my_model", TorchCustomModel)
        trainer = get_trainer_from_params(params)

        if params["train"]:
            for iteration in range(params['num_training_iters']):
                print("starting training iteration {}".format(iteration))
                trainer.train()
                # Save a checkpoint only after the final iteration.
                if iteration == params['num_training_iters'] - 1:
                    checkpoint_path = trainer.save()
                    print(checkpoint_path)
Пример #6
0
    def test_evaluation_option(self):
        """Check that evaluation metrics appear every second iteration.

        With ``evaluation_interval=2`` the results of iterations 0 and 2
        must not contain an "evaluation" key, while those of iterations 1
        and 3 must.
        """
        config = dqn.DEFAULT_CONFIG.copy()
        config.update({
            "env": "CartPole-v0",
            "evaluation_interval": 2,
            "evaluation_num_episodes": 2,
            "evaluation_config": {
                "gamma": 0.98,
            },
            # Use a custom callback that asserts that we are running the
            # configured exact number of episodes per evaluation.
            "callbacks": AssertNumEvalEpisodesCallback,
        })

        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = dqn.DQNTrainer(config=config)
            # Run four training iterations, printing each result.
            collected = []
            for _step in range(4):
                step_result = trainer.train()
                print(step_result)
                collected.append(step_result)
            trainer.stop()
            r0, r1, r2, r3 = collected

            self.assertFalse("evaluation" in r0)
            self.assertTrue("evaluation" in r1)
            self.assertFalse("evaluation" in r2)
            self.assertTrue("evaluation" in r3)
            self.assertTrue("episode_reward_mean" in r1["evaluation"])
            self.assertNotEqual(r1["evaluation"], r3["evaluation"])
Пример #7
0
def create_agent(args):
    """Build a torch DQN trainer for the MissileCommand environment.

    Args:
        args (argparse.Namespace): Parsed arguments. ``args.checkpoint`` is
            restored from when truthy; ``args.verbose > 0`` prints the
            policy model.

    Returns:
        The ``dqn.DQNTrainer`` instance that was created.
    """
    # Overrides applied on top of the DQN defaults.
    config = dqn.DEFAULT_CONFIG.copy()
    config.update({
        "double_q": True,
        "dueling": True,
        "framework": "torch",
        "horizon": 1150,
        "num_gpus": 1,
        "num_workers": 19,
        "train_batch_size": 128,
    })

    agent = dqn.DQNTrainer(env=MissileCommand, config=config)

    # Optionally resume from a checkpoint.
    if args.checkpoint:
        agent.restore(args.checkpoint)

    # Optionally print the model.
    if args.verbose > 0:
        model = agent.get_policy().model
        framework = config["framework"]
        if framework == "tf":
            # NOTE(review): this prints the type of summary()'s return
            # value, not the summary itself - confirm that is intended.
            print(type(model.base_model.summary()))
        elif framework == "torch":
            print(model)

    return agent
Пример #8
0
 def test_traj_view_normal_case(self):
     """Tests, whether Model and Policy return the correct ViewRequirements.

     The model is expected to expose one inference view requirement and
     the policy eight, one per key listed below.
     """
     config = dqn.DEFAULT_CONFIG.copy()
     for _ in framework_iterator(config):
         trainer = dqn.DQNTrainer(
             config,
             env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv")
         policy = trainer.get_policy()
         view_req_model = policy.model.inference_view_requirements
         view_req_policy = policy.view_requirements
         assert len(view_req_model) == 1, view_req_model
         assert len(view_req_policy) == 8, view_req_policy

         expected_keys = (
             SampleBatch.OBS,
             SampleBatch.ACTIONS,
             SampleBatch.REWARDS,
             SampleBatch.DONES,
             SampleBatch.NEXT_OBS,
             SampleBatch.EPS_ID,
             SampleBatch.AGENT_INDEX,
             "weights",
         )
         for key in expected_keys:
             assert key in view_req_policy
             requirement = view_req_policy[key]
             if key == SampleBatch.NEXT_OBS:
                 # Next-obs is the only column backed by another column:
                 # the observations, shifted by one.
                 assert requirement.data_col == SampleBatch.OBS
                 assert requirement.shift == 1
             else:
                 assert requirement.data_col is None
         trainer.stop()
Пример #9
0
    def test_evaluation_option_always_attach_eval_metrics(self):
        """Check that every train result carries an "evaluation" entry.

        The config sets ``always_attach_evaluation_results`` with
        ``evaluation_interval=2``; all four results must contain the
        "evaluation" key.
        """
        config = dqn.DEFAULT_CONFIG.copy()
        config.update(
            {
                "env": "CartPole-v0",
                "evaluation_interval": 2,
                "evaluation_duration": 2,
                "evaluation_duration_unit": "episodes",
                "evaluation_config": {
                    "gamma": 0.98,
                },
                "always_attach_evaluation_results": True,
                # Use a custom callback that asserts that we are running the
                # configured exact number of episodes per evaluation.
                "callbacks": AssertEvalCallback,
            }
        )

        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = dqn.DQNTrainer(config=config)
            # Should always see latest available eval results.
            all_results = [trainer.train() for _step in range(4)]
            trainer.stop()

            # Every result is expected to carry the key, including those of
            # iterations in which no evaluation was run.
            for result in all_results:
                self.assertTrue("evaluation" in result)
Пример #10
0
    def test_evaluation_option(self):
        """Check that evaluation metrics appear every second iteration.

        With ``evaluation_interval=2`` only the results of iterations 1 and
        3 may contain an "evaluation" key.
        """
        eval_settings = {
            "env": "CartPole-v0",
            "evaluation_interval": 2,
            "evaluation_num_episodes": 2,
            "evaluation_config": {
                "gamma": 0.98,
            }
        }
        config = dqn.DEFAULT_CONFIG.copy()
        config.update(eval_settings)

        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = dqn.DQNTrainer(config=config)
            # Train four times, printing each result as it arrives.
            outcomes = []
            while len(outcomes) < 4:
                outcome = trainer.train()
                print(outcome)
                outcomes.append(outcome)
            trainer.stop()
            r0, r1, r2, r3 = outcomes

            # Even iterations: no evaluation metrics; odd iterations: present.
            self.assertFalse("evaluation" in r0)
            self.assertTrue("evaluation" in r1)
            self.assertFalse("evaluation" in r2)
            self.assertTrue("evaluation" in r3)
            self.assertTrue("episode_reward_mean" in r1["evaluation"])
            self.assertNotEqual(r1["evaluation"], r3["evaluation"])
Пример #11
0
    def test_dqn_fake_multi_gpu_learning(self):
        """Test whether DQNTrainer can learn CartPole w/ faked multi-GPU.

        Training runs for at most 200 iterations per framework and counts
        as successful once ``episode_reward_mean`` exceeds 65.0.
        """
        config = copy.deepcopy(dqn.DEFAULT_CONFIG)

        # Fake GPU setup.
        config["num_gpus"] = 2
        config["_fake_gpus"] = True

        # Double batch size (2 GPUs).
        config["train_batch_size"] = 64
        # Mimic tuned_example for DQN CartPole.
        config["n_step"] = 3
        model_config = config["model"]
        model_config["fcnet_hiddens"] = [64]
        model_config["fcnet_activation"] = "linear"

        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = dqn.DQNTrainer(config=config, env="CartPole-v0")
            num_iterations = 200
            learnt = False
            for _step in range(num_iterations):
                mean_reward = trainer.train()["episode_reward_mean"]
                print("reward={}".format(mean_reward))
                if mean_reward > 65.0:
                    learnt = True
                    break
            assert learnt, \
                "DQN multi-GPU (with fake-GPUs) did not learn CartPole!"
            trainer.stop()
Пример #12
0
def get_dqn_car_trainer():
    """Build a torch DQN trainer for the StoppingCar environment.

    ``TorchCustomModel`` is registered under the name "my_model" before
    the trainer is created.

    Returns:
        tuple: ``(trainer, config)`` - the ``dqn.DQNTrainer`` and the config
        dict it was built from.
    """
    ModelCatalog.register_custom_model("my_model", TorchCustomModel)

    # Model config.
    model_settings = {
        "custom_model": "my_model",
        "fcnet_hiddens": [16],
        "fcnet_activation": "relu"
    }
    # Overrides used for evaluation (env_config, exploration, etc. could be
    # overridden here as well).
    evaluation_settings = {
        "explore": False
    }
    config = {
        "env": StoppingCar,
        "model": model_settings,
        "lr": 0.001,
        "grad_clip": 2500,
        "num_workers": 8,  # parallelism
        "batch_mode": "truncate_episodes",
        "rollout_fragment_length": 2000,
        "num_envs_per_worker": 10,
        "train_batch_size": 4000,
        "hiddens": [16],
        "framework": "torch",
        "horizon": 8000,
        "evaluation_config": evaluation_settings,
    }
    return dqn.DQNTrainer(config=config), config
Пример #13
0
    def test_on_sub_environment_created_with_remote_envs(self):
        """Check per-worker sub-env vector index sums with remote envs.

        Two remote workers with four remote sub-environments each are
        expected to report a vector index sum of 6 (0 + 1 + 2 + 3).
        """
        config = {
            "env": "CartPole-v1",
            # Make each sub-environment a ray actor.
            "remote_worker_envs": True,
            # Create 4 sub-environments (ray remote actors) per remote
            # worker.
            "num_envs_per_worker": 4,
            # Create 2 remote workers.
            "num_workers": 2,
            "callbacks": OnSubEnvironmentCreatedCallback,
        }

        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = dqn.DQNTrainer(config=config)
            worker_set = trainer.workers
            # The local worker has no env, so its counter is faked as -1 to
            # keep the `foreach_worker()` lambda below from failing.
            worker_set.local_worker().sum_sub_env_vector_indices = -1

            # One entry per worker: local worker first, then the remotes.
            index_sums = worker_set.foreach_worker(
                lambda w: w.sum_sub_env_vector_indices)
            # Local worker: the -1 placeholder set above.
            self.assertTrue(index_sums[0] == -1)
            # Remote workers (index 1 and 2): 0 + 1 + 2 + 3 each.
            self.assertTrue(index_sums[1] == 6)
            self.assertTrue(index_sums[2] == 6)
            trainer.stop()
Пример #14
0
def trainDqn(numIter):
    """
    Train a DQN trainer on HiLoPricingEnv.

    Restarts Ray, builds the trainer from ``createConfig()`` and runs
    ``numIter`` training iterations, printing each result and the value of
    ``HiLoPricingEnv.count`` after the iteration.

    Returns the trainer.
    """
    ray.shutdown()
    ray.init()
    trainer = dqn.DQNTrainer(config=createConfig(), env=HiLoPricingEnv)

    for iteration in range(numIter):
        print("\n**** next iteration " + str(iteration))
        # Reset the class-level counter so it reflects this iteration only.
        HiLoPricingEnv.count = 0
        print(pretty_print(trainer.train()))
        print("env reset count " + str(HiLoPricingEnv.count))

    # The policy's weights and model are fetched but not used further; they
    # are kept available for ad-hoc inspection.
    policy = trainer.get_policy()
    _ = policy.get_weights()
    _ = policy.model

    return trainer
Пример #15
0
    def __new__(cls, config=None):
        """Create the trainer selected by the 'agent' entry of ``config``.

        Args:
            config (dict, optional): Trainer configuration. Its 'agent'
                entry selects the trainer ("DQN" or "PPO") and is removed
                before the remaining entries are passed to the trainer.
                The dict passed in is left unmodified.

        Returns:
            ``dqn.DQNTrainer`` for "DQN", ``ppo.APPOTrainer`` for "PPO".

        Raises:
            Exception: If the agent name is missing or not supported.
        """
        # Work on a private copy: a `{}` default would be shared between
        # calls, and popping from the caller's dict would silently change it.
        settings = dict(config) if config is not None else {}

        name = settings.pop('agent', None)
        if name == "DQN":
            return dqn.DQNTrainer(config=settings)
        elif name == "PPO":
            # NOTE(review): "PPO" maps to APPOTrainer, not PPOTrainer -
            # confirm this is intended.
            return ppo.APPOTrainer(config=settings)
        else:
            raise Exception("{} agent is not supported".format(name))
Пример #16
0
    def test_dqn_compilation(self):
        """Test whether a DQNTrainer can be built with both frameworks.

        Builds and trains a trainer on CartPole-v0 twice: first with
        ``config["eager"]`` set to True, then with it set to False. Each
        trainer is stopped after training.
        """
        config = dqn.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        num_iterations = 2

        for eager in (True, False):
            config["eager"] = eager
            trainer = dqn.DQNTrainer(config=config, env="CartPole-v0")
            for _ in range(num_iterations):
                results = trainer.train()
                print(results)
            # Release this trainer before the next one is built.
            trainer.stop()
Пример #17
0
def loadTrainer(path):
    """
    Rebuild a DQN trainer and restore its state from a checkpoint.

    Restarts Ray, creates a trainer for HiLoPricingEnv from
    ``createConfig()`` and restores it from ``path``.

    Returns the restored trainer.
    """
    # Start from a fresh Ray session.
    ray.shutdown()
    ray.init()

    restored = dqn.DQNTrainer(config=createConfig(), env=HiLoPricingEnv)
    restored.restore(path)
    return restored
Пример #18
0
    def test_dqn_compilation(self):
        """Test whether a DQNTrainer can be built with both frameworks.

        Three trainers are built and trained on CartPole-v0 in turn: a
        Rainbow config with ``eager`` off, the plain config with ``eager``
        off, and the plain config with ``eager`` on inside an
        ``eager_mode()`` context. Each trainer is stopped after training.
        """
        config = dqn.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.

        # Rainbow.
        rainbow_config = config.copy()
        rainbow_config["eager"] = False
        rainbow_config["num_atoms"] = 10
        rainbow_config["noisy"] = True
        rainbow_config["double_q"] = True
        rainbow_config["dueling"] = True
        rainbow_config["n_step"] = 5
        trainer = dqn.DQNTrainer(config=rainbow_config, env="CartPole-v0")
        num_iterations = 2
        for _ in range(num_iterations):
            results = trainer.train()
            print(results)
        trainer.stop()

        # tf.
        tf_config = config.copy()
        tf_config["eager"] = False
        trainer = dqn.DQNTrainer(config=tf_config, env="CartPole-v0")
        num_iterations = 1
        for _ in range(num_iterations):
            results = trainer.train()
            print(results)
        trainer.stop()

        # Eager.
        eager_config = config.copy()
        eager_config["eager"] = True
        # `with` exits the eager context even if building or training
        # raises, which manual __enter__/__exit__ calls would not.
        with eager_mode():
            trainer = dqn.DQNTrainer(config=eager_config, env="CartPole-v0")
            num_iterations = 1
            for _ in range(num_iterations):
                results = trainer.train()
                print(results)
            trainer.stop()
Пример #19
0
 def test_leaky_policy(self):
     """Tests, whether our diagnostics tools can detect leaks in a policy.

     A trainer is built whose default policy is ``MemoryLeakingPolicy``;
     ``check_memory_leaks`` must return a truthy result for "policy".
     """
     config = dqn.DEFAULT_CONFIG.copy()
     # Make sure we have an env to test on the local worker.
     # Otherwise, `check_memory_leaks` will complain.
     config["create_env_on_driver"] = True
     config["env"] = "CartPole-v0"
     # `.copy()` above is shallow, so the nested "multiagent" dict is still
     # the one held by dqn.DEFAULT_CONFIG. Replace it with a new dict rather
     # than writing the leaking policy into the shared defaults.
     config["multiagent"] = dict(
         config["multiagent"],
         policies={
             "default_policy": PolicySpec(policy_class=MemoryLeakingPolicy),
         },
     )
     trainer = dqn.DQNTrainer(config=config)
     results = check_memory_leaks(trainer, to_check={"policy"}, repeats=300)
     assert results["policy"]
     trainer.stop()
Пример #20
0
    def _discrete_run(self):
        """Launch a Tune run of a DQN trainer on CartPole-v0 with torch.

        Reads ``self.configs['mode']`` to set ``double_q`` and
        ``self.configs['algorithm']`` ('dqn' or 'ddqn') to pick the entry
        handed to ``tune.run``.
        """

        from ray import tune
        from ray.rllib.agents import dqn
        from ray.rllib.agents.dqn import DEFAULT_CONFIG
        # NOTE(review): the assignments below write into the imported
        # module-level DEFAULT_CONFIG dict itself, not into a copy.
        DEFAULT_CONFIG['framework']='torch'
        # A truthy 'mode' switches double-Q off; a falsy one switches it on.
        if self.configs['mode']:
            DEFAULT_CONFIG['double_q']=False
        else:
            DEFAULT_CONFIG['double_q']=True
        # NOTE(review): `ray` is imported here but not referenced in this
        # method.
        import ray
        # NOTE(review): 'dqn' maps to the trainer class, while 'ddqn' maps
        # to a trainer instance that is constructed here on every call,
        # whichever algorithm is selected. Confirm that `tune.run` accepts
        # an instance as its first argument.
        AGENT_CONFIG={'dqn':dqn.DQNTrainer,
        'ddqn':dqn.DQNTrainer(config=DEFAULT_CONFIG,env='CartPole-v0')}
        # Raises KeyError for any algorithm other than 'dqn' / 'ddqn'.
        agent=AGENT_CONFIG[self.configs['algorithm']]
        tune.run(agent, config={"env": "CartPole-v0","framework":"torch"})
Пример #21
0
def train_rllib_policy(config):
    """Train a DQNTrainer for ``args.train_iters`` iterations and save it.

    The trainer is built from ``config`` alone, so the environment has to
    be specified there. Each iteration's result is printed.

    Returns:
        The value returned by ``trainer.save()`` - the checkpoint from
        which the DQNTrainer can be restored.
    """
    trainer = dqn.DQNTrainer(config=config)

    # Train, reporting every iteration's result.
    for _iteration in range(args.train_iters):
        iteration_result = trainer.train()
        print(iteration_result)

    checkpoint = trainer.save()
    return checkpoint
Пример #22
0
def get_rl_agent(agent_name, config, env_to_agent):
    """Create the RLlib trainer that matches ``agent_name``.

    The module for the selected trainer is imported only when needed.

    Args:
        agent_name: One of the agent-name constants compared against below.
        config: Trainer configuration passed through unchanged.
        env_to_agent: Environment passed to the trainer as ``env``.

    Returns:
        The trainer instance that was created.

    Raises:
        Exception: If ``agent_name`` matches none of the known names.
    """
    if agent_name == A2C:
        import ray.rllib.agents.a3c as a2c
        return a2c.A2CTrainer(config=config, env=env_to_agent)
    if agent_name == A3C:
        import ray.rllib.agents.a3c as a3c
        return a3c.A3CTrainer(config=config, env=env_to_agent)
    if agent_name == BC:
        import ray.rllib.agents.marwil as bc
        return bc.BCTrainer(config=config, env=env_to_agent)
    if agent_name == DQN:
        import ray.rllib.agents.dqn as dqn
        return dqn.DQNTrainer(config=config, env=env_to_agent)
    if agent_name == APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        return dqn.ApexTrainer(config=config, env=env_to_agent)
    if agent_name == IMPALA:
        import ray.rllib.agents.impala as impala
        return impala.ImpalaTrainer(config=config, env=env_to_agent)
    if agent_name == MARWIL:
        import ray.rllib.agents.marwil as marwil
        return marwil.MARWILTrainer(config=config, env=env_to_agent)
    if agent_name == PG:
        import ray.rllib.agents.pg as pg
        return pg.PGTrainer(config=config, env=env_to_agent)
    if agent_name == PPO:
        import ray.rllib.agents.ppo as ppo
        return ppo.PPOTrainer(config=config, env=env_to_agent)
    if agent_name == APPO:
        import ray.rllib.agents.ppo as ppo
        return ppo.APPOTrainer(config=config, env=env_to_agent)
    if agent_name == SAC:
        import ray.rllib.agents.sac as sac
        return sac.SACTrainer(config=config, env=env_to_agent)
    if agent_name == LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        return lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    if agent_name == LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        return lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    # No known agent name matched.
    raise Exception("Not valid agent name")
Пример #23
0
    def test_traj_view_normal_case(self):
        """Tests, whether Model and Policy return the correct ViewRequirements.

        Also samples one batch from the local worker and checks that its
        size equals ``num_envs_per_worker * rollout_fragment_length``.
        """
        config = dqn.DEFAULT_CONFIG.copy()
        config["num_envs_per_worker"] = 10
        config["rollout_fragment_length"] = 4

        for _ in framework_iterator(config):
            trainer = dqn.DQNTrainer(
                config,
                env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv")
            policy = trainer.get_policy()
            view_req_model = policy.model.inference_view_requirements
            view_req_policy = policy.view_requirements
            assert len(view_req_model) == 1, view_req_model
            assert len(view_req_policy) == 8, view_req_policy

            expected_keys = (
                SampleBatch.OBS,
                SampleBatch.ACTIONS,
                SampleBatch.REWARDS,
                SampleBatch.DONES,
                SampleBatch.NEXT_OBS,
                SampleBatch.EPS_ID,
                SampleBatch.AGENT_INDEX,
                "weights",
            )
            for key in expected_keys:
                assert key in view_req_policy
                requirement = view_req_policy[key]
                if key == SampleBatch.NEXT_OBS:
                    # Next-obs is the only column backed by another column:
                    # the observations, at relative position 1.
                    assert requirement.data_col == SampleBatch.OBS
                    assert requirement.data_rel_pos == 1
                else:
                    assert requirement.data_col is None

            # A sampled batch holds one fragment per sub-environment.
            batch = trainer.workers.local_worker().sample()
            expected_count = (
                config["num_envs_per_worker"]
                * config["rollout_fragment_length"])
            assert batch.count == expected_count
            for column in batch.data.values():
                assert len(column) == expected_count
            trainer.stop()
Пример #24
0
def get_rllib_agent(agent_name, env_name, env, env_to_agent):
    """Create the RLlib trainer that matches ``agent_name``.

    The config comes from ``get_config(env_name, env, 1)`` when
    ``is_rllib_agent(agent_name)`` is true, and is empty otherwise. The
    module for the selected trainer is imported only when needed.

    Args:
        agent_name: One of the ``RLLIB_*`` agent-name constants.
        env_name: Environment name forwarded to ``get_config``.
        env: Environment forwarded to ``get_config``.
        env_to_agent: Environment passed to the trainer as ``env``.

    Returns:
        The trainer instance that was created.

    Raises:
        ValueError: If ``agent_name`` matches none of the known names.
    """
    config = get_config(env_name, env, 1) if is_rllib_agent(agent_name) else {}
    if agent_name == RLLIB_A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        # Without this branch an unknown name would only fail at `return
        # agent` with an UnboundLocalError that hides the real cause.
        raise ValueError(
            "Not a valid RLlib agent name: {!r}".format(agent_name))
    return agent
Пример #25
0
def create_agent(args):
    """Create a double, dueling DQN trainer and optionally restore it.

    The trainer is built on ``GymEnv`` with the torch framework and a fully
    connected network with two hidden layers of 64 units and ReLU activation.

    Args:
        args (argparse.Namespace): argparse arguments. Reads
            ``args.checkpoint`` (checkpoint to restore; skipped when falsy)
            and ``args.verbose`` (the policy model is printed when > 0).

    Returns:
        agent (dqn.DQNTrainer): the configured, possibly restored, trainer.
    """
    # Custom configuration
    config = dqn.DEFAULT_CONFIG.copy()
    config["double_q"] = True
    config["dueling"] = True
    config["framework"] = "torch"
    config["lr"] = 5e-4
    config["num_gpus"] = 1
    config["num_workers"] = 1
    config["train_batch_size"] = 128

    # Custom model
    # `.copy()` above is shallow, so the nested "model" dict is still the one
    # held by `dqn.DEFAULT_CONFIG`. Build a fresh dict instead of writing into
    # it, otherwise the module-level defaults would be modified as well.
    config["model"] = dict(
        config["model"],
        fcnet_activation="relu",
        fcnet_hiddens=[64, 64],
    )

    # Agent creation
    agent = dqn.DQNTrainer(env=GymEnv, config=config)

    # To optionally load a checkpoint
    if args.checkpoint:
        agent.restore(args.checkpoint)

    # Print model
    if args.verbose > 0:
        model = agent.get_policy().model
        if config["framework"] == "tf":
            # Keras' `summary()` prints the table itself and returns None, so
            # it must be called directly rather than wrapped in `print`.
            model.base_model.summary()
        elif config["framework"] == "torch":
            print(model)

    return agent
Пример #26
0
    def test_dqn_exploration_and_soft_q_config(self):
        """Tests, whether a DQN Agent outputs exploration/softmaxed actions.

        For every framework yielded by `framework_iterator`, DQN trainers are
        built on "FrozenLake-v0" with the default exploration, with SoftQ
        (very low and higher temperature) and with Random exploration. For
        each setup, `compute_action` is called repeatedly on the same
        observation and the spread of the returned actions is checked.
        """
        base_config = dqn.DEFAULT_CONFIG.copy()
        base_config["num_workers"] = 0  # Run locally.
        base_config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs = np.array(0)

        # Test against all frameworks.
        for _ in framework_iterator(base_config):
            # Work on a per-framework copy: the exploration settings are
            # overwritten further down and must not leak into the next
            # framework's "default" section.
            config = base_config.copy()

            # Default EpsilonGreedy setup.
            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
            # Setting explore=False should always return the same action.
            a_ = trainer.compute_action(obs, explore=False)
            for _ in range(50):
                a = trainer.compute_action(obs, explore=False)
                check(a, a_)
            # explore=None (default: explore) should return different actions.
            actions = [trainer.compute_action(obs) for _ in range(50)]
            check(np.std(actions), 0.0, false=True)
            trainer.stop()

            # Low softmax temperature. Behaves like argmax
            # (but no epsilon exploration).
            config["exploration_config"] = {
                "type": "SoftQ",
                "temperature": 0.000001
            }
            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
            # Due to the low temp, always expect the same action.
            actions = [trainer.compute_action(obs) for _ in range(51)]
            check(np.std(actions), 0.0, decimals=3)
            trainer.stop()

            # Higher softmax temperature.
            config["exploration_config"]["temperature"] = 1.0
            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")

            # Even with the higher temperature, if we set explore=False, we
            # should expect the same actions always.
            a_ = trainer.compute_action(obs, explore=False)
            for _ in range(50):
                a = trainer.compute_action(obs, explore=False)
                check(a, a_)

            # Due to the higher temp, expect different actions avg'ing
            # around 1.5.
            actions = [trainer.compute_action(obs) for _ in range(300)]
            check(np.std(actions), 0.0, false=True)
            trainer.stop()

            # With Random exploration.
            config["exploration_config"] = {"type": "Random"}
            config["explore"] = True
            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
            actions = [trainer.compute_action(obs) for _ in range(300)]
            check(np.std(actions), 0.0, false=True)
            trainer.stop()
Пример #27
0
    def test_dqn_parameter_noise_exploration(self):
        """Tests, whether a DQN Agent works with ParameterNoise.

        For every framework yielded by `framework_iterator`, three setups are
        exercised on "FrozenLake-v0":

        1. ParameterNoise with config["explore"]=True.
        2. ParameterNoise with config["explore"]=False.
        3. ParameterNoise on top of an EpsilonGreedy sub-exploration whose
           initial epsilon is 0.0.

        The checks compare the policy's current noise and weights (read via
        `self._get_current_noise` / `self._get_current_weight`) before and
        after episode start, action computation and episode end.
        """
        # The same observation is fed to every `compute_action` call below.
        obs = np.array(0)
        core_config = dqn.DEFAULT_CONFIG.copy()
        core_config["num_workers"] = 0  # Run locally.
        core_config["env_config"] = {"is_slippery": False, "map_name": "4x4"}

        # Test against all frameworks.
        for fw in framework_iterator(core_config):
            # Fresh copy, so the exploration settings below do not end up in
            # `core_config`.
            config = core_config.copy()

            # DQN with ParameterNoise exploration (config["explore"]=True).
            # ----
            config["exploration_config"] = {"type": "ParameterNoise"}
            config["explore"] = True

            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
            policy = trainer.get_policy()
            # The policy's `_sess` attribute if it has one, else None; passed
            # on as `tf_sess` to the exploration's episode callbacks.
            p_sess = getattr(policy, "_sess", None)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise_before = self._get_current_noise(policy, fw)
            check(noise_before, 0.0)
            initial_weights = self._get_current_weight(policy, fw)

            # Pseudo-start an episode and compare the weights before and after.
            policy.exploration.on_episode_start(policy, tf_sess=p_sess)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise_after_ep_start = self._get_current_noise(policy, fw)
            weights_after_ep_start = self._get_current_weight(policy, fw)
            # Should be the same, as we don't do anything at the beginning of
            # the episode, only one step later.
            check(noise_after_ep_start, noise_before)
            check(initial_weights, weights_after_ep_start)

            # Setting explore=False should always return the same action.
            a_ = trainer.compute_action(obs, explore=False)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise = self._get_current_noise(policy, fw)
            # We sampled the first noise (not zero anymore).
            check(noise, 0.0, false=True)
            # But still not applied b/c explore=False.
            check(self._get_current_weight(policy, fw), initial_weights)
            for _ in range(10):
                a = trainer.compute_action(obs, explore=False)
                check(a, a_)
                # Noise never gets applied.
                check(self._get_current_weight(policy, fw), initial_weights)
                self.assertFalse(
                    policy.exploration.weights_are_currently_noisy)

            # Explore=None (default: True) should return different actions.
            # However, this is only due to the underlying epsilon-greedy
            # exploration.
            actions = []
            current_weight = None
            for _ in range(10):
                actions.append(trainer.compute_action(obs))
                self.assertTrue(policy.exploration.weights_are_currently_noisy)
                # Now, noise actually got applied (explore=True).
                current_weight = self._get_current_weight(policy, fw)
                check(current_weight, initial_weights, false=True)
                check(current_weight, initial_weights + noise)
            check(np.std(actions), 0.0, false=True)

            # Pseudo-end the episode and compare weights again.
            # Make sure they are the original ones.
            policy.exploration.on_episode_end(policy, tf_sess=p_sess)
            weights_after_ep_end = self._get_current_weight(policy, fw)
            check(current_weight - noise, weights_after_ep_end, decimals=5)

            # DQN with ParameterNoise exploration (config["explore"]=False).
            # ----
            config = core_config.copy()
            config["exploration_config"] = {"type": "ParameterNoise"}
            config["explore"] = False
            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
            policy = trainer.get_policy()
            p_sess = getattr(policy, "_sess", None)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            initial_weights = self._get_current_weight(policy, fw)

            # Noise before anything (should be 0.0, no episode started yet).
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0)

            # Pseudo-start an episode and compare the weights before and after
            # (they should be the same).
            policy.exploration.on_episode_start(policy, tf_sess=p_sess)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)

            # Should be the same, as we don't do anything at the beginning of
            # the episode, only one step later.
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0)
            noisy_weights = self._get_current_weight(policy, fw)
            check(initial_weights, noisy_weights)

            # Setting explore=False or None should always return the same
            # action.
            a_ = trainer.compute_action(obs, explore=False)
            # Now we have re-sampled.
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0, false=True)
            for _ in range(5):
                # explore=None falls back to config["explore"], which is False
                # in this section.
                a = trainer.compute_action(obs, explore=None)
                check(a, a_)
                a = trainer.compute_action(obs, explore=False)
                check(a, a_)

            # Pseudo-end the episode and compare weights again.
            # Make sure they are the original ones (no noise permanently
            # applied throughout the episode).
            policy.exploration.on_episode_end(policy, tf_sess=p_sess)
            weights_after_episode_end = self._get_current_weight(policy, fw)
            check(initial_weights, weights_after_episode_end)
            # Noise should still be the same (re-sampling only happens at
            # beginning of episode).
            noise_after = self._get_current_noise(policy, fw)
            check(noise, noise_after)

            # Switch off EpsilonGreedy underlying exploration.
            # ----
            config = core_config.copy()
            config["exploration_config"] = {
                "type": "ParameterNoise",
                "sub_exploration": {
                    "type": "EpsilonGreedy",
                    # Note: `trainer` is still the trainer built in the
                    # previous section; its policy's action space is reused.
                    "action_space": trainer.get_policy().action_space,
                    "initial_epsilon": 0.0,  # <- no randomness whatsoever
                }
            }
            config["explore"] = True
            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
            # Now, when we act - even with explore=True - we would expect
            # the same action for the same input (parameter noise is
            # deterministic).
            policy = trainer.get_policy()
            p_sess = getattr(policy, "_sess", None)
            policy.exploration.on_episode_start(policy, tf_sess=p_sess)
            a_ = trainer.compute_action(obs)
            for _ in range(10):
                a = trainer.compute_action(obs, explore=True)
                check(a, a_)
Пример #28
0
def main():
    """Train DQN or SlateQ on the RecSim env, via Tune or a Trainer directly.

    Reads the command line through the module-level `parser`. With
    `--use_tune` the run is handed to `tune.run` (stopping at 4,000,000
    total timesteps; SlateQ additionally grid-searches over
    `ALL_SLATEQ_STRATEGIES`). Otherwise a trainer is built locally
    (0 GPUs, 0 workers) and trained for 10 iterations, printing each result.

    Raises:
        ValueError: If `args.agent` is neither "DQN" nor "SlateQ".
    """
    args = parser.parse_args()

    # Validate before starting Ray, so that a bad argument fails right away
    # instead of after Ray has been brought up.
    if args.agent not in ["DQN", "SlateQ"]:
        raise ValueError(args.agent)
    use_dqn = args.agent == "DQN"

    env_config = {
        "slate_size": args.env_slate_size,
        "seed": args.env_seed,
        "convert_to_discrete_action_space": use_dqn,
    }

    ray.init()
    # Make sure Ray is shut down again even if training raises.
    try:
        if args.use_tune:
            time_signature = datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
            name = f"SlateQ/{args.agent}-seed{args.env_seed}-{time_signature}"
            tune_config = {
                "env": recsim_env_name,
                "num_gpus": args.num_gpus,
                "num_workers": args.num_workers,
                "env_config": env_config,
            }
            if not use_dqn:
                tune_config["slateq_strategy"] = tune.grid_search(
                    ALL_SLATEQ_STRATEGIES)
            # `args.agent` was validated above, so it is exactly the Tune
            # trainable name ("DQN" or "SlateQ").
            tune.run(
                args.agent,
                stop={"timesteps_total": 4000000},
                name=name,
                config=tune_config,
                num_samples=args.tune_num_samples,
                verbose=1,
            )
        else:
            # directly run using the trainer interface (good for debugging)
            agent_module = dqn if use_dqn else slateq
            config = agent_module.DEFAULT_CONFIG.copy()
            config["num_gpus"] = 0
            config["num_workers"] = 0
            config["env_config"] = env_config
            if use_dqn:
                trainer = dqn.DQNTrainer(config=config, env=recsim_env_name)
            else:
                config["slateq_strategy"] = args.strategy
                trainer = slateq.SlateQTrainer(
                    config=config, env=recsim_env_name)
            for _ in range(10):
                result = trainer.train()
                print(pretty_print(result))
    finally:
        ray.shutdown()
Пример #29
0
def main():
    """Run DQN or SlateQ on a RecSim env, optionally after a random baseline.

    Reads the command line through the module-level `parser`. The steps are:

    1. Start Ray.
    2. If `args.random_test_episodes` is set, play that many episodes with
       actions sampled from the env's action space and print the mean
       episode return with its standard error.
    3. With `--use_tune`, hand the run to `tune.run` and, if `args.as_test`
       is set, verify the result with `check_learning_achieved`. Otherwise
       build the trainer directly and train it for 10 iterations, printing
       each result.
    4. Shut Ray down, also when one of the steps above raised.
    """
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # Make sure Ray is shut down again even if anything below raises
    # (e.g. a failed `check_learning_achieved`).
    try:
        env_config = {
            "num_candidates": args.env_num_candidates,
            "resample_documents": not args.env_dont_resample_documents,
            "slate_size": args.env_slate_size,
            "seed": args.env_seed,
            "convert_to_discrete_action_space": args.run == "DQN",
        }

        # Any env name other than the two listed ones falls back to the
        # long-term-satisfaction env.
        if args.env == "interest-evolution":
            env_cls = InterestEvolutionRecSimEnv
        elif args.env == "interest-exploration":
            env_cls = InterestExplorationRecSimEnv
        else:
            env_cls = LongTermSatisfactionRecSimEnv

        config = {
            "env": env_cls,
            "framework": args.framework,
            "num_gpus": args.num_gpus,
            "num_workers": args.num_workers,
            "env_config": env_config,
            "learning_starts": args.learning_starts,
        }

        # Perform a test run on the env with a random agent to see, what
        # the random baseline reward is.
        if args.random_test_episodes:
            print(f"Running {args.random_test_episodes} episodes to get a "
                  "random agent's baseline reward ...")
            env = config["env"](config=env_config)
            env.reset()
            num_episodes = 0
            episode_rewards = []
            episode_reward = 0.0
            while num_episodes < args.random_test_episodes:
                action = env.action_space.sample()
                _, reward, done, _ = env.step(action)
                episode_reward += reward
                if done:
                    # Episode finished: record its return and start over.
                    num_episodes += 1
                    episode_rewards.append(episode_reward)
                    episode_reward = 0.0
                    env.reset()
            print(f"Ran {args.random_test_episodes} episodes with a random "
                  "agent reaching a mean episode return of "
                  f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}.")

        if args.use_tune:
            stop = {
                "training_iteration": args.stop_iters,
                "timesteps_total": args.stop_timesteps,
                "episode_reward_mean": args.stop_reward,
            }

            if args.run == "SlateQ":
                config.update({
                    "slateq_strategy": args.slateq_strategy,
                })
            results = tune.run(
                args.run,
                stop=stop,
                config=config,
                num_samples=args.tune_num_samples,
                verbose=2,
            )

            if args.as_test:
                check_learning_achieved(results, args.stop_reward)

        else:
            # Directly run using the trainer interface (good for debugging).
            if args.run == "DQN":
                trainer = dqn.DQNTrainer(config=config)
            else:
                config.update({
                    "slateq_strategy": args.slateq_strategy,
                })
                trainer = slateq.SlateQTrainer(config=config)
            for _ in range(10):
                result = trainer.train()
                print(pretty_print(result))
    finally:
        ray.shutdown()
Пример #30
0
mc = max(checkpoint_numbers)
checkpoint_path = path_to_results+"/"+"checkpoint_{}/checkpoint-{}".format(mc,mc)
print("found {} checkpoints".format(len(checkpoint_numbers)))
print("restoring "+checkpoint_path)

# ============================================================== #
# evaluation {{{
# ============================================================== #
#ray.init()
ray.init(temp_dir=tmpdir+"/ray")  # you may need to change the temp directory in case it runs on a cluster or shared machine

if config["optimizer_class"] == "AsyncReplayOptimizer":
    trainer = dqn.ApexTrainer(config=config, env=CodeEnv)
else:
    trainer = dqn.DQNTrainer(config=config, env=CodeEnv)
trainer.restore(checkpoint_path)
env = CodeEnv(env_config)
n = env.n

dB_len = len(dB_range)
BitErr = np.zeros([dB_len], dtype=int)
CwErr = np.zeros([dB_len], dtype=int)
totCw = np.zeros([dB_len], dtype=int)
totBit = np.zeros([dB_len], dtype=int)

for i in range(dB_len):
    print("\n--------\nSimulating EbNo = {} dB".format(dB_range[i]))
    env.set_EbNo_dB(dB_range[i])

    while(CwErr[i]<minCwErr and totCw[i]+1<=maxCw):