Example #1
 def check_learned(self):
     """
     Evaluate the learned agent: restore it from the checkpoint and roll out an episode.
     """
     ray.init(local_mode=True)
     if self.algorithm == 'PPO':
         agent = ppo.PPOTrainer(config=self.ray_config,
                                env=self.env.__class__)
     elif self.algorithm == 'A3C':
         agent = a3c.A3CTrainer(config=self.ray_config,
                                env=self.env.__class__)
     elif self.algorithm == 'PG':
         agent = pg.PGTrainer(config=self.ray_config,
                              env=self.env.__class__)
     else:
         raise ValueError(f"Unsupported algorithm: {self.algorithm}")
     agent.restore(self.checkpoint_path)
     # run until episode ends
     episode_reward = 0
     done = False
     obs = self.env.reset()
     while not done:
         self.env.render()
         action = agent.compute_action(obs)
         obs, reward, done, info = self.env.step(action)
         # print(f"obs:\n{obs}")
         print(f"reward:\n{reward}")
         print(f"info:\n{info}")
         episode_reward += reward
     print(f"episode_reward:\n{episode_reward}")
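For context, here is a minimal standalone sketch of the same restore-and-evaluate pattern outside of a class, assuming an existing PGTrainer checkpoint (the path below is only a placeholder) and a plain Gym CartPole environment:

import gym
import ray
import ray.rllib.agents.pg as pg

ray.init(local_mode=True)

config = pg.DEFAULT_CONFIG.copy()
config["num_workers"] = 0
agent = pg.PGTrainer(config=config, env="CartPole-v0")
# Placeholder path: point this at a checkpoint produced by agent.save().
agent.restore("/tmp/pg_cartpole/checkpoint_000010/checkpoint-10")

env = gym.make("CartPole-v0")
obs = env.reset()
done = False
episode_reward = 0.0
while not done:
    action = agent.compute_action(obs)
    obs, reward, done, info = env.step(action)
    episode_reward += reward
print(f"episode_reward: {episode_reward}")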
Example #2
    def test_add_delete_policy(self):
        env = gym.make("CartPole-v0")

        config = pg.DEFAULT_CONFIG.copy()
        config.update({
            "env": MultiAgentCartPole,
            "env_config": {
                "config": {
                    "num_agents": 4,
                },
            },
            "multiagent": {
                # Start with a single policy.
                "policies": {
                    "p0": (None, env.observation_space, env.action_space, {}),
                },
                "policy_mapping_fn": lambda aid, episode, **kwargs: "p0",
                "policy_map_capacity": 2,
            },
        })

        # TODO: (sven) this will work for tf, once we have the DynamicTFPolicy
        #  refactor PR merged.
        for _ in framework_iterator(config, frameworks=("tf2", "torch")):
            trainer = pg.PGTrainer(config=config)
            r = trainer.train()
            self.assertTrue("p0" in r["policy_reward_min"])
            for i in range(1, 4):

                def new_mapping_fn(agent_id, episode, **kwargs):
                    return f"p{choice([i, i - 1])}"

                # Add a new policy.
                new_pol = trainer.add_policy(
                    f"p{i}",
                    trainer._policy_class,
                    observation_space=env.observation_space,
                    action_space=env.action_space,
                    config={},
                    # Test changing the mapping fn.
                    policy_mapping_fn=new_mapping_fn,
                    # Change the list of policies to train.
                    policies_to_train=[f"p{i}", f"p{i-1}"],
                )
                pol_map = trainer.workers.local_worker().policy_map
                self.assertTrue(new_pol is not trainer.get_policy("p0"))
                for j in range(i):
                    self.assertTrue(f"p{j}" in pol_map)
                self.assertTrue(len(pol_map) == i + 1)
                r = trainer.train()
                self.assertTrue("p1" in r["policy_reward_min"])

            # Delete all added policies again from trainer.
            for i in range(3, 0, -1):
                trainer.remove_policy(
                    f"p{i}",
                    policy_mapping_fn=lambda aid, eps, **kwargs: f"p{i - 1}",
                    policies_to_train=[f"p{i - 1}"])

            trainer.stop()
Example #3
    def test_pg_fake_multi_gpu_learning(self):
        """Test whether PGTrainer can learn CartPole w/ faked multi-GPU."""
        config = copy.deepcopy(pg.DEFAULT_CONFIG)

        # Fake GPU setup.
        config["num_gpus"] = 2
        config["_fake_gpus"] = True

        config["framework"] = "tf"
        # Mimic tuned_example for PG CartPole.
        config["model"]["fcnet_hiddens"] = [64]
        config["model"]["fcnet_activation"] = "linear"

        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        num_iterations = 200
        learnt = False
        for i in range(num_iterations):
            results = trainer.train()
            print("reward={}".format(results["episode_reward_mean"]))
            # Make this test quite short (75.0).
            if results["episode_reward_mean"] > 75.0:
                learnt = True
                break
        assert learnt, "PG multi-GPU (with fake-GPUs) did not learn CartPole!"
        trainer.stop()
Example #4
    def test_timesteps(self):
        """Test whether PG correctly increments the policy's global_timestep."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["model"]["fcnet_hiddens"] = [1]
        config["model"]["fcnet_activation"] = None

        obs = np.array(1)
        obs_batch = np.array([1])

        for _ in framework_iterator(config):
            trainer = pg.PGTrainer(config=config, env=RandomEnv)
            policy = trainer.get_policy()

            for i in range(1, 21):
                trainer.compute_single_action(obs)
                self.assertEqual(policy.global_timestep, i)
            for i in range(1, 21):
                policy.compute_actions(obs_batch)
                self.assertEqual(policy.global_timestep, i + 20)

            # Artificially set the timestep counter to 100 billion, then keep
            # computing actions and train.
            crazy_timesteps = int(1e11)
            policy.global_timestep = crazy_timesteps
            # Run for 10 more ts.
            for i in range(1, 11):
                policy.compute_actions(obs_batch)
                self.assertEqual(policy.global_timestep, i + crazy_timesteps)
            trainer.train()
Example #5
 def test_pg_exec_impl(ray_start_regular):
     trainer = pg.PGTrainer(env="CartPole-v0",
                            config={
                                "min_iter_time_s": 0,
                                "use_exec_api": True
                            })
     assert isinstance(trainer.train(), dict)
Example #6
    def test_pg_compilation(self):
        """Test whether a PGTrainer can be built with both frameworks."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.

        # tf.
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")

        num_iterations = 2
        for i in range(num_iterations):
            trainer.train()

        # Torch.
        config["use_pytorch"] = True
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        for i in range(num_iterations):
            trainer.train()
Example #7
    def test_add_delete_policy(self):
        env = gym.make("CartPole-v0")

        config = pg.DEFAULT_CONFIG.copy()
        config.update({
            "env": MultiAgentCartPole,
            "env_config": {
                "config": {
                    "num_agents": 4,
                },
            },
            "multiagent": {
                # Start with a single policy.
                "policies": {
                    "p0": (None, env.observation_space, env.action_space, {}),
                },
                "policy_mapping_fn": lambda aid, episode, **kwargs: "p0",
            },
        })

        # TODO: (sven): Fix TrainTFMultiGPU to be flexible wrt adding policies
        #  on-the-fly.
        for _ in framework_iterator(config, frameworks=("tf2", "torch")):
            trainer = pg.PGTrainer(config=config)
            # Train once with only the initial policy (p0).
            r0 = trainer.train()
            self.assertTrue("p0" in r0["policy_reward_min"])
            for i in range(1, 4):
                # Add a new policy.
                new_pol = trainer.add_policy(
                    f"p{i}",
                    trainer._policy_class,
                    observation_space=env.observation_space,
                    action_space=env.action_space,
                    config={},
                    # Test changing the mapping fn.
                    policy_mapping_fn=lambda aid, eps, **kwargs: f"p{i}",
                    # Change the list of policies to train.
                    policies_to_train=[f"p{i}"],
                )
                pol_map = trainer.workers.local_worker().policy_map
                self.assertTrue(new_pol is not trainer.get_policy("p0"))
                self.assertTrue("p0" in pol_map)
                self.assertTrue("p1" in pol_map)
                self.assertTrue(len(pol_map) == i + 1)
                r = trainer.train()
                self.assertTrue("p1" in r["policy_reward_min"])

            # Delete all added policies again from trainer.
            for i in range(3, 0, -1):
                trainer.remove_policy(
                    f"p{i}",
                    policy_mapping_fn=lambda aid, eps, **kwargs: f"p{i - 1}",
                    policies_to_train=[f"p{i - 1}"])

            trainer.stop()
Example #8
    def test_pg_compilation(self):
        """Test whether a PGTrainer can be built with all frameworks."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 1
        config["rollout_fragment_length"] = 500
        # Use an observation filter to check that filters work w/o preprocessing.
        config["observation_filter"] = "MeanStdFilter"
        num_iterations = 1

        image_space = Box(-1.0, 1.0, shape=(84, 84, 3))
        simple_space = Box(-1.0, 1.0, shape=(3,))

        tune.register_env(
            "random_dict_env",
            lambda _: RandomEnv(
                {
                    "observation_space": Dict(
                        {
                            "a": simple_space,
                            "b": Discrete(2),
                            "c": image_space,
                        }
                    ),
                    "action_space": Box(-1.0, 1.0, shape=(1,)),
                }
            ),
        )
        tune.register_env(
            "random_tuple_env",
            lambda _: RandomEnv(
                {
                    "observation_space": Tuple(
                        [simple_space, Discrete(2), image_space]
                    ),
                    "action_space": Box(-1.0, 1.0, shape=(1,)),
                }
            ),
        )

        for _ in framework_iterator(config, with_eager_tracing=True):
            # Test for different env types (discrete w/ and w/o image, + cont).
            for env in [
                "random_dict_env",
                "random_tuple_env",
                "MsPacmanNoFrameskip-v4",
                "CartPole-v0",
                "FrozenLake-v1",
            ]:
                print(f"env={env}")
                trainer = pg.PGTrainer(config=config, env=env)
                for i in range(num_iterations):
                    results = trainer.train()
                    check_train_results(results)
                    print(results)

                check_compute_single_action(trainer, include_prev_action_reward=True)
Example #9
    def test_pg_compilation(self):
        """Test whether a PGTrainer can be built with both frameworks."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        num_iterations = 2

        for _ in framework_iterator(config):
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            for i in range(num_iterations):
                trainer.train()
            check_compute_action(trainer, include_prev_action_reward=True)
Example #10
    def test_bad_envs(self):
        """Tests different "bad env" errors."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0

        # Non-existing / non-registered gym env string.
        env = "Alien-Attack-v42"
        for _ in framework_iterator(config):
            self.assertRaisesRegex(
                EnvError,
                f"The env string you provided \\('{env}'\\) is",
                lambda: pg.PGTrainer(config=config, env=env),
            )

        # Malformed gym env string (must have v\d at end).
        env = "Alien-Attack-part-42"
        for _ in framework_iterator(config):
            self.assertRaisesRegex(
                EnvError,
                f"The env string you provided \\('{env}'\\) is",
                lambda: pg.PGTrainer(config=config, env=env),
            )

        # Non-existing class in a full-class-path.
        env = "ray.rllib.examples.env.random_env.RandomEnvThatDoesntExist"
        for _ in framework_iterator(config):
            self.assertRaisesRegex(
                EnvError,
                f"The env string you provided \\('{env}'\\) is",
                lambda: pg.PGTrainer(config=config, env=env),
            )

        # Non-existing module inside a full-class-path.
        env = "ray.rllib.examples.env.module_that_doesnt_exist.SomeEnv"
        for _ in framework_iterator(config):
            self.assertRaisesRegex(
                EnvError,
                f"The env string you provided \\('{env}'\\) is",
                lambda: pg.PGTrainer(config=config, env=env),
            )
Example #11
    def test_space_inference_from_remote_workers(self):
        # Expect to not do space inference if the learner has an env.

        env = gym.make("CartPole-v0")

        config = pg.DEFAULT_CONFIG.copy()
        config["env"] = "CartPole-v0"
        config["num_workers"] = 1

        # No env on driver -> expect longer build time due to space
        # lookup from remote worker.
        t0 = time.time()
        trainer = pg.PGTrainer(config=config)
        w_lookup = time.time() - t0
        print(f"No env on learner: {w_lookup}sec")
        trainer.stop()

        # Env on driver -> expect shorter build time due to no space
        # lookup required from remote worker.
        config["create_env_on_driver"] = True
        t0 = time.time()
        trainer = pg.PGTrainer(config=config)
        wo_lookup = time.time() - t0
        print(f"Env on learner: {wo_lookup}sec")
        self.assertLess(wo_lookup, w_lookup)
        trainer.stop()

        # Spaces given -> expect shorter build time due to no space
        # lookup required from remote worker.
        config["create_env_on_driver"] = False
        config["observation_space"] = env.observation_space
        config["action_space"] = env.action_space
        t0 = time.time()
        trainer = pg.PGTrainer(config=config)
        wo_lookup = time.time() - t0
        print(f"Spaces given manually in config: {wo_lookup}sec")
        self.assertLess(wo_lookup, w_lookup)
        trainer.stop()
Example #12
    def test_pg_compilation(self):
        """Test whether a PGTrainer can be built with both frameworks."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 1
        config["rollout_fragment_length"] = 500
        num_iterations = 1

        for _ in framework_iterator(config):
            for env in ["FrozenLake-v0", "CartPole-v0"]:
                trainer = pg.PGTrainer(config=config, env=env)
                for i in range(num_iterations):
                    print(trainer.train())
                check_compute_single_action(
                    trainer, include_prev_action_reward=True)
Example #13
    def test_pg_compilation(self):
        """Test whether a PGTrainer can be built with both frameworks."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0
        num_iterations = 2

        for fw in framework_iterator(config):
            # For tf, build with fake-GPUs.
            config["_fake_gpus"] = fw == "tf"
            config["num_gpus"] = 2 if fw == "tf" else 0
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            for i in range(num_iterations):
                print(trainer.train())
            check_compute_single_action(trainer,
                                        include_prev_action_reward=True)
Example #14
def get_rl_agent(agent_name, config, env_to_agent):
    if agent_name == A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        raise ValueError("Not a valid agent name: {}".format(agent_name))
    return agent
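The if/elif chain above can also be written as a lookup table. Below is a minimal sketch of that variant (the helper name is illustrative), assuming the agent-name constants are plain strings and the same Trainer classes are importable; only a subset of entries is shown:

def get_rl_agent_from_registry(agent_name, config, env_to_agent):
    # Imports grouped here for brevity; the original defers each import to its branch.
    import ray.rllib.agents.a3c as a3c
    import ray.rllib.agents.dqn as dqn
    import ray.rllib.agents.pg as pg
    import ray.rllib.agents.ppo as ppo

    registry = {
        "A2C": a3c.A2CTrainer,
        "A3C": a3c.A3CTrainer,
        "DQN": dqn.DQNTrainer,
        "APEX_DQN": dqn.ApexTrainer,
        "PG": pg.PGTrainer,
        "PPO": ppo.PPOTrainer,
        "APPO": ppo.APPOTrainer,
    }
    try:
        trainer_cls = registry[agent_name]
    except KeyError:
        raise ValueError("Not a valid agent name: {}".format(agent_name))
    return trainer_cls(config=config, env=env_to_agent)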
Example #15
def get_rllib_agent(agent_name, env_name, env, env_to_agent):
    config = get_config(env_name, env, 1) if is_rllib_agent(agent_name) else {}
    if agent_name == RLLIB_A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        raise ValueError("Not a valid agent name: {}".format(agent_name))
    return agent
Example #16
    def test_validate_config_idempotent(self):
        """
        Asserts that running validate_config multiple times on
        COMMON_CONFIG is idempotent.
        """
        # Given:
        standard_config = copy.deepcopy(COMMON_CONFIG)
        trainer = pg.PGTrainer(env="CartPole-v0", config=standard_config)

        # When (we validate config 2 times).
        # Try deprecated `Trainer._validate_config()` method (static).
        trainer._validate_config(standard_config, trainer)
        config_v1 = copy.deepcopy(standard_config)
        # Try new method: `Trainer.validate_config()` (non-static).
        trainer.validate_config(standard_config)
        config_v2 = copy.deepcopy(standard_config)

        # Make sure nothing changed.
        self.assertEqual(config_v1, config_v2)

        trainer.stop()
Example #17
#         array[offset + observation] = 1

# # Can optionally call trainer.restore(path) to load a checkpoint.
# class MyPreprocessor(Preprocessor):
#     def _init_shape(self, obs_space, options):
#         return (4, 4, 1)

#     def transform(self, observation):
#         arr = np.zeros(16, )
#         arr[observation] = 1
#         return arr.reshape(4, 4, 1)

#ModelCatalog.register_custom_preprocessor("my_prep", OneHotPreprocessor)
#config["model"]["custom_preprocessor"] = "my_prep"

#ModelCatalog.register_custom_model("my_model", Dense)
#config["model"]["custom_model"] = "my_model"

#trainer = ppo.PPOTrainer(config=config, env=env_test)
#trainer = dqn.DQNTrainer(config=config, env=env_test)
trainer = pg.PGTrainer(config=config, env=env_test)

for i in range(200):
    # Perform one iteration of training the policy with PG.
    result = trainer.train()
    print(pretty_print(result))

    if i % 100 == 0:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)
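The commented-out lines above hint at registering a custom preprocessor; here is a minimal sketch of that pattern (the class name is illustrative), assuming the older Preprocessor/ModelCatalog API and a Discrete(16) observation space that gets one-hot encoded into a 4x4x1 array:

import numpy as np
from ray.rllib.models import ModelCatalog
from ray.rllib.models.preprocessors import Preprocessor


class OneHot4x4Preprocessor(Preprocessor):
    """One-hot encode a Discrete(16) observation as a 4x4x1 array."""

    def _init_shape(self, obs_space, options):
        return (4, 4, 1)

    def transform(self, observation):
        arr = np.zeros(16, dtype=np.float32)
        arr[observation] = 1.0
        return arr.reshape(4, 4, 1)


ModelCatalog.register_custom_preprocessor("my_prep", OneHot4x4Preprocessor)
# Then enable it in the trainer config before building the trainer:
# config["model"]["custom_preprocessor"] = "my_prep"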
Example #18
    def test_add_delete_policy(self):
        config = pg.DEFAULT_CONFIG.copy()
        config.update(
            {
                "env": MultiAgentCartPole,
                "env_config": {
                    "config": {
                        "num_agents": 4,
                    },
                },
                "num_workers": 2,  # Test on remote workers as well.
                "num_cpus_per_worker": 0.1,
                "model": {
                    "fcnet_hiddens": [5],
                    "fcnet_activation": "linear",
                },
                "train_batch_size": 100,
                "rollout_fragment_length": 50,
                "multiagent": {
                    # Start with a single policy.
                    "policies": {"p0"},
                    "policy_mapping_fn": lambda aid, eps, worker, **kwargs: "p0",
                    # And only two policies that can be stored in memory at a
                    # time.
                    "policy_map_capacity": 2,
                },
                "evaluation_num_workers": 1,
                "evaluation_config": {
                    "num_cpus_per_worker": 0.1,
                },
            }
        )

        for _ in framework_iterator(config):
            trainer = pg.PGTrainer(config=config)
            pol0 = trainer.get_policy("p0")
            r = trainer.train()
            self.assertTrue("p0" in r["info"][LEARNER_INFO])
            for i in range(1, 3):

                def new_mapping_fn(agent_id, episode, worker, **kwargs):
                    return f"p{choice([i, i - 1])}"

                # Add a new policy.
                pid = f"p{i}"
                new_pol = trainer.add_policy(
                    pid,
                    trainer.get_default_policy_class(config),
                    # Test changing the mapping fn.
                    policy_mapping_fn=new_mapping_fn,
                    # Change the list of policies to train.
                    policies_to_train=[f"p{i}", f"p{i-1}"],
                )
                pol_map = trainer.workers.local_worker().policy_map
                self.assertTrue(new_pol is not pol0)
                for j in range(i + 1):
                    self.assertTrue(f"p{j}" in pol_map)
                self.assertTrue(len(pol_map) == i + 1)
                trainer.train()
                checkpoint = trainer.save()

                # Test restoring from the checkpoint (which has more policies
                # than what's defined in the config dict).
                test = pg.PGTrainer(config=config)
                test.restore(checkpoint)

                # Make sure evaluation worker also gets the restored policy.
                def _has_policy(w):
                    return w.get_policy("p0") is not None

                self.assertTrue(
                    all(test.evaluation_workers.foreach_worker(_has_policy))
                )

                # Make sure trainer can continue training the restored policy.
                pol0 = test.get_policy("p0")
                test.train()
                # Test creating an action with the added (and restored) policy.
                a = test.compute_single_action(
                    np.zeros_like(pol0.observation_space.sample()), policy_id=pid
                )
                self.assertTrue(pol0.action_space.contains(a))
                test.stop()

            # Delete all added policies again from trainer.
            for i in range(2, 0, -1):
                trainer.remove_policy(
                    f"p{i}",
                    # Note that the complete signature of a policy_mapping_fn
                    # is: `agent_id, episode, worker, **kwargs`.
                    policy_mapping_fn=lambda aid, eps, **kwargs: f"p{i - 1}",
                    policies_to_train=[f"p{i - 1}"],
                )

            trainer.stop()
Example #19
    def test_pg_loss_functions(self):
        """Tests the PG loss function math."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["eager"] = True
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n time steps.
        train_batch = {
            SampleBatch.CUR_OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]]),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES:
            np.array([False, False, True])
        }

        # tf.
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()

        # Post-process (calculate simple (non-GAE) advantages) and attach to
        # train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch = pg.post_process_advantages(policy, train_batch)
        # Check Advantage values.
        check(train_batch[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        results = pg.pg_tf_loss(policy,
                                policy.model,
                                dist_class=Categorical,
                                train_batch=train_batch)

        # Calculate expected results.
        expected_logits = fc(
            fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
               vars[1].numpy()), vars[2].numpy(), vars[3].numpy())
        expected_logp = Categorical(expected_logits, policy.model).logp(
            train_batch[SampleBatch.ACTIONS])
        expected_loss = -np.mean(
            expected_logp * train_batch[Postprocessing.ADVANTAGES])
        check(results.numpy(), expected_loss, decimals=4)

        # Torch.
        config["use_pytorch"] = True
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        train_batch = policy._lazy_tensor_dict(train_batch)
        results = pg.pg_torch_loss(policy,
                                   policy.model,
                                   dist_class=TorchCategorical,
                                   train_batch=train_batch)
        expected_logits = policy.model.last_output()
        expected_logp = TorchCategorical(expected_logits, policy.model).logp(
            train_batch[SampleBatch.ACTIONS])
        expected_loss = -np.mean(
            expected_logp.detach().numpy() *
            train_batch[Postprocessing.ADVANTAGES].numpy())
        check(results.detach().numpy(), expected_loss, decimals=4)
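The expected advantage values in the comment above ([2.9701, 1.99, 1.0]) are simply the discounted reward-to-go sums with gamma=0.99; a quick standalone check:

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 1.0, 1.0])

# Accumulate the discounted return-to-go backwards through the 3-step episode.
returns = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    returns[t] = running
print(returns)  # -> [2.9701 1.99   1.    ]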
Example #20
    def test_pg_loss_functions(self):
        """Tests the PG loss function math."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n time steps.
        train_batch = SampleBatch({
            SampleBatch.OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]]),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES:
            np.array([False, False, True]),
            SampleBatch.EPS_ID:
            np.array([1234, 1234, 1234]),
            SampleBatch.AGENT_INDEX:
            np.array([0, 0, 0]),
        })

        for fw, sess in framework_iterator(config, session=True):
            dist_cls = (Categorical if fw != "torch" else TorchCategorical)
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            vars = policy.model.trainable_variables()
            if sess:
                vars = policy.get_session().run(vars)

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
            # [2.9701, 1.99, 1.0]
            train_batch_ = pg.post_process_advantages(policy,
                                                      train_batch.copy())
            if fw == "torch":
                train_batch_ = policy._lazy_tensor_dict(train_batch_)

            # Check Advantage values.
            check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

            # Actual loss results.
            if sess:
                results = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(train_batch_,
                                                           shuffle=False))
            else:
                results = (pg.pg_tf_loss if fw in ["tf2", "tfe"] else
                           pg.pg_torch_loss)(policy,
                                             policy.model,
                                             dist_class=dist_cls,
                                             train_batch=train_batch_)

            # Calculate expected results.
            if fw != "torch":
                expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                        vars[0],
                                        vars[1],
                                        framework=fw),
                                     vars[2],
                                     vars[3],
                                     framework=fw)
            else:
                expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                        vars[2],
                                        vars[3],
                                        framework=fw),
                                     vars[0],
                                     vars[1],
                                     framework=fw)
            expected_logp = dist_cls(expected_logits, policy.model).logp(
                train_batch_[SampleBatch.ACTIONS])
            adv = train_batch_[Postprocessing.ADVANTAGES]
            if sess:
                expected_logp = sess.run(expected_logp)
            elif fw == "torch":
                expected_logp = expected_logp.detach().cpu().numpy()
                adv = adv.detach().cpu().numpy()
            else:
                expected_logp = expected_logp.numpy()
            expected_loss = -np.mean(expected_logp * adv)
            check(results, expected_loss, decimals=4)
Example #21
    (dqn.dqn_policy.DQNTFPolicy, env.observation_space, env.action_space, {})
}
"""
trainer = dqn.DQNTrainer(
        env="multi_air-v0",
        config={
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": ["dqn_policy"],
            },
            "gamma": 0.99,
            "n_step": 3,
            #"num_gpus": 1,
            #"num_workers": 16
        })
"""

#trainer = dqn.DQNTrainer(env="multi_air-v0")
#trainer = ppo.PPOTrainer(env="multi_air-v0")
#trainer = a3c.A3CTrainer(env="multi_air-v0")
trainer = pg.PGTrainer(env="multi_air-v0")
for i in range(num_train_itr):
    x = trainer.train()
    if i % 100 == 0:
        print("****************************Iteration: ", i,
              "****************************")
        print(pretty_print(x))

trainer.save()
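The quoted-out DQN config above refers to a policies dict and a policy_mapping_fn that are defined elsewhere in the original script; below is a hypothetical minimal sketch of that pair, using CartPole spaces as a stand-in because the custom "multi_air-v0" environment is not part of this excerpt (it assumes an older RLlib version where dqn.dqn_policy.DQNTFPolicy exists, as in the fragment above):

import gym
import ray.rllib.agents.dqn as dqn

# Stand-in spaces; the original script derives these from "multi_air-v0".
_spaces_env = gym.make("CartPole-v0")

policies = {
    "dqn_policy": (dqn.dqn_policy.DQNTFPolicy,
                   _spaces_env.observation_space,
                   _spaces_env.action_space, {}),
}


def policy_mapping_fn(agent_id):
    # Route every agent id to the single trained DQN policy.
    return "dqn_policy"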
Example #22
def render(checkpoint, home_path):
    """
    Renders pybullet and mujoco environments.
    """
    alg = re.match('.+?(?=_)', os.path.basename(os.path.normpath(home_path))).group(0)
    current_env = re.search("(?<=_).*?(?=_)", os.path.basename(os.path.normpath(home_path))).group(0)
    checkpoint_path = home_path + "checkpoint_" + str(checkpoint) + "/checkpoint-" + str(checkpoint)
    config = json.load(open(home_path + "params.json"))
    config_bin = pickle.load(open(home_path + "params.pkl", "rb"))
    ray.shutdown()
    import pybullet_envs
    ray.init()
    ModelCatalog.register_custom_model("RBF", RBFModel)
    ModelCatalog.register_custom_model("MLP_2_64", MLP)
    ModelCatalog.register_custom_model("linear", Linear)

    if alg == "PPO":
        trainer = ppo.PPOTrainer(config_bin)
    if alg == "SAC":
        trainer = sac.SACTrainer(config)
    if alg == "DDPG":
        trainer = ddpg.DDPGTrainer(config)
    if alg == "PG":
        trainer = pg.PGTrainer(config)
    if alg == "A3C":
        trainer = a3c.A3CTrainer(config)
    if alg == "TD3":
        trainer = td3.TD3Trainer(config)
    if alg == "ES":
        trainer = es.ESTrainer(config)
    if alg == "ARS":
        trainer = ars.ARSTrainer(config)
#   "normalize_actions": true,
    trainer.restore(checkpoint_path)

    if "Bullet" in current_env:
        env = gym.make(current_env, render=True)
    else:
        env = gym.make(current_env)
    #env.unwrapped.reset_model = det_reset_model
    env._max_episode_steps = 10000
    obs = env.reset()

    action_hist = []
    m_act_hist = []
    state_hist = []
    obs_hist = []
    reward_hist = []

    done = False
    step = 0

    for t in range(10000):
        # For some algorithms you can also extract the sample mean; adjust the
        # index below to match your env.
        # mean_actions = out_dict['behaviour_logits'][:17]
        # actions = trainer.compute_action(obs.flatten())
        # sampled_actions, _ , out_dict = trainer.compute_action(obs.flatten(),full_fetch=True)
        sampled_actions = trainer.compute_action(obs.flatten())
        # sampled_actions, _ , out_dict = trainer.compute_action(obs.flatten(),full_fetch=True)
        
        actions = sampled_actions
        
        obs, reward, done, _ = env.step(np.asarray(actions))
        # env.camera_adjust()
        env.render(mode='human')
        time.sleep(0.01)
        # env.render()
        # env.render(mode='rgb_array', close = True)
        # p.computeViewMatrix(cameraEyePosition=[0,10,5], cameraTargetPosition=[0,0,0], cameraUpVector=[0,0,0])

        # if step % 1000 == 0:
        #     env.reset()
        # step += 1
        
        action_hist.append(np.copy(actions))
        obs_hist.append(np.copy(obs))
        reward_hist.append(np.copy(reward))
        if done:
            obs = env.reset()
    # print(sum(reward_hist))
    # print((obs_hist))
    #plt.plot(action_hist)
    #plt.figure()
    #plt.figure()
    #plt.plot(obs_hist)
    #plt.figure()

    # Reminder: the behaviour logits that come out are the mean and logstd
    # (not the log mean, despite the name "logit").
    # trainer.compute_action(obs, full_fetch=True)
    trainer.compute_action(obs)
Example #23
    'env_config': {
        'simu_len': args.simu_len,
        'num_ex': args.num_ex
    }
}

for key, value in my_params.items():
    my_config[key] = value

# initialize the Ray backend

ray.init(address=args.default_ray_address)

# create the RLLib trainer object

trainer = pg.PGTrainer(config=my_config)

# get a reference to Azure ML Run object, to be used to log training metrics

run = Run.get_context()

# execute the RLLib training loop

for i in range(args.num_iterations):
    start_time = time.time()
    result = trainer.train()
    end_time = time.time()

    print(
        'Iteration: {0} - Mean Score: {1} - Min Score: {2} - Max Score: {3} - Elapsed time: {4} s.'
        .format(result['training_iteration'],