Example no. 1
    def test_ddpg_exploration_and_with_random_prerun(self):
        """Tests DDPG's Exploration (w/ random actions for n timesteps)."""
        core_config = ddpg.DEFAULT_CONFIG.copy()
        core_config["num_workers"] = 0  # Run locally.
        obs = np.array([0.0, 0.1, -0.1])

        # Test against all frameworks.
        for _ in framework_iterator(core_config):
            config = core_config.copy()
            # Default OUNoise setup.
            trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
            # Setting explore=False should always return the same action.
            a_ = trainer.compute_action(obs, explore=False)
            self.assertEqual(trainer.get_policy().global_timestep, 1)
            for i in range(50):
                a = trainer.compute_action(obs, explore=False)
                self.assertEqual(trainer.get_policy().global_timestep, i + 2)
                check(a, a_)
            # explore=None (default: explore) should return different actions.
            actions = []
            for i in range(50):
                actions.append(trainer.compute_action(obs))
                self.assertEqual(trainer.get_policy().global_timestep, i + 52)
            check(np.std(actions), 0.0, false=True)
            trainer.stop()

            # Check randomness at beginning.
            config["exploration_config"] = {
                # Act randomly at beginning ...
                "random_timesteps": 50,
                # Then act very closely to deterministic actions thereafter.
                "ou_base_scale": 0.001,
                "initial_scale": 0.001,
                "final_scale": 0.001,
            }
            trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
            # ts=0 (get a deterministic action as per explore=False).
            deterministic_action = trainer.compute_action(obs, explore=False)
            self.assertEqual(trainer.get_policy().global_timestep, 1)
            # ts=1-49 (in random window).
            random_a = []
            for i in range(1, 50):
                random_a.append(trainer.compute_action(obs, explore=True))
                self.assertEqual(trainer.get_policy().global_timestep, i + 1)
                check(random_a[-1], deterministic_action, false=True)
            self.assertTrue(np.std(random_a) > 0.5)

            # ts > 50 (a=deterministic_action + scale * N[0,1])
            for i in range(50):
                a = trainer.compute_action(obs, explore=True)
                self.assertEqual(trainer.get_policy().global_timestep, i + 51)
                check(a, deterministic_action, rtol=0.1)

            # ts >> 50 (BUT: explore=False -> expect deterministic action).
            for i in range(50):
                a = trainer.compute_action(obs, explore=False)
                self.assertEqual(trainer.get_policy().global_timestep, i + 101)
                check(a, deterministic_action)
            trainer.stop()
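For reference outside the test, the same exploration switch can be set up directly; a minimal sketch, assuming the same RLlib version as above (partial "exploration_config" overrides are merged into the default OrnsteinUhlenbeckNoise settings) and a local Ray instance:

import numpy as np
import ray
from ray.rllib.agents import ddpg

ray.init(ignore_reinit_error=True)

config = ddpg.DEFAULT_CONFIG.copy()
config["num_workers"] = 0  # Run locally.
config["exploration_config"] = {
    "random_timesteps": 1000,  # Pure random actions for the first 1000 timesteps.
    "ou_base_scale": 0.1,      # OU-noise scale used after the random phase.
    "initial_scale": 1.0,
    "final_scale": 0.02,
}

trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
obs = np.array([0.0, 0.1, -0.1])
print(trainer.compute_action(obs, explore=True))   # random / noisy action
print(trainer.compute_action(obs, explore=False))  # deterministic action
trainer.stop()
ray.shutdown()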
Example no. 2
    def __init__(self, agent_name, env, config, logger_creator):
        assert agent_name in [
            "td3",
            "ddpg",
            "ppo",
        ], "Some policies are not currently supported (dqn,sac)"  # dqn and sac not currently supported
        self.agent_name = agent_name

        if self.agent_name == "ppo":
            self.trainer = ppo.PPOTrainer(
                env=env,
                config=config,
                logger_creator=logger_creator,
            )
        elif self.agent_name == "ddpg":
            self.trainer = ddpg.DDPGTrainer(
                env=env,
                config=config,
                logger_creator=logger_creator,
            )
        elif self.agent_name == "td3":
            self.trainer = ddpg.TD3Trainer(
                env=env,
                config=config,
                logger_creator=logger_creator,
            )
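The if/elif dispatch above can also be expressed as a dictionary lookup mapping agent names to trainer classes; a minimal, self-contained sketch (env name and config are placeholders, assuming the same ray.rllib.agents layout):

import ray
from ray.rllib.agents import ddpg, ppo

TRAINER_CLASSES = {
    "ppo": ppo.PPOTrainer,
    "ddpg": ddpg.DDPGTrainer,
    "td3": ddpg.TD3Trainer,
}

ray.init(ignore_reinit_error=True)
agent_name = "td3"
assert agent_name in TRAINER_CLASSES, "dqn and sac are not currently supported"
trainer = TRAINER_CLASSES[agent_name](env="Pendulum-v0", config={"num_workers": 0})
print(trainer.train()["episode_reward_mean"])
trainer.stop()
ray.shutdown()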
Example no. 3
    def test_ddpg_compilation(self):
        """Test whether a DDPGTrainer can be built with both frameworks."""
        config = ddpg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 1
        config["num_envs_per_worker"] = 2
        config["learning_starts"] = 0
        config["exploration_config"]["random_timesteps"] = 100

        num_iterations = 1

        # Test against all frameworks.
        for _ in framework_iterator(config):
            trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
            for i in range(num_iterations):
                results = trainer.train()
                print(results)
            check_compute_single_action(trainer)
            # Ensure apply_gradient_fn is being called and updating global_step
            if config["framework"] == "tf":
                a = trainer.get_policy().global_step.eval(
                    trainer.get_policy().get_session())
            else:
                a = trainer.get_policy().global_step
            check(a, 500)
            trainer.stop()
Example no. 4
    def test(self, algo, path, lr, fc_hid, fc_act):
        """Test trained agent for a single episode. Return the episode reward"""
        # instantiate env class
        unused_shared = []
        unused_own = []
        unsatisfied_shared = []
        unsatisfied_own = []

        episode_reward = 0

        #self.config["num_workers"] = 0
        self.config["lr"] = lr
        self.config['model']["fcnet_hiddens"] = fc_hid
        self.config['model']["fcnet_activation"] = fc_act

        if algo == "ppo":
            self.agent = ppo.PPOTrainer(config=self.config)
        if algo == "ddpg":
            self.agent = ddpg.DDPGTrainer(config=self.config)
        if algo == "a3c":
            self.agent = a3c.A3CTrainer(config=self.config)
        if algo == "impala":
            self.agent = impala.ImpalaTrainer(config=self.config)
        if algo == "appo":
            self.agent = ppo.APPOTrainer(config=self.config)
        if algo == "td3":
            self.agent = ddpg.TD3Trainer(config=self.config)

        self.agent.restore(path)

        env = caching_vM(config=self.config)

        obs = env.reset()
        done = False

        action = {}
        for agent_id, agent_obs in obs.items():
            policy_id = self.config['multiagent']['policy_mapping_fn'](
                agent_id)
            action[agent_id] = self.agent.compute_action(agent_obs,
                                                         policy_id=policy_id)
        obs, reward, done, info = env.step(action)
        done = done['__all__']

        for x in range(len(info)):
            res = ast.literal_eval(info[x])
            unused_shared.append(res[0])
            unused_own.append(res[1])
            unsatisfied_shared.append(res[2])
            unsatisfied_own.append(res[3])

        print("reward == ", reward)
        # sum up reward for all agents
        episode_reward += sum(reward.values())

        return episode_reward, unused_shared, unused_own, unsatisfied_shared, unsatisfied_own
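The snippet above evaluates a single environment step; a full-episode multi-agent rollout usually loops until the "__all__" done flag. A hedged helper sketch (the agent, env and policy_mapping_fn arguments are placeholders patterned on the example, not the original project's code):

def run_episode(agent, env, policy_mapping_fn):
    """Roll out one full multi-agent episode and return the summed reward.

    `agent` is any RLlib trainer, `env` a MultiAgentEnv instance, and
    `policy_mapping_fn` maps agent ids to policy ids.
    """
    obs = env.reset()
    done = {"__all__": False}
    episode_reward = 0.0
    while not done["__all__"]:
        # One action per agent, computed by the policy that agent maps to.
        action = {
            agent_id: agent.compute_action(
                agent_obs, policy_id=policy_mapping_fn(agent_id))
            for agent_id, agent_obs in obs.items()
        }
        obs, reward, done, info = env.step(action)
        episode_reward += sum(reward.values())
    return episode_reward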
Example no. 5
    def test(self, algo, path, lr, fc_hid, fc_act):
        """Test trained agent for a single episode. Return the episode reward."""
        # instantiate env class
        unused_shared = []
        unused_own = []
        unsatisfied_shared = []
        unsatisfied_own = []

        episode_reward = 0
        self.config_test["num_workers"] = 0
        self.config_test["lr"] = lr
        self.config_test['model']["fcnet_hiddens"] = fc_hid
        self.config_test['model']["fcnet_activation"] = fc_act

        if algo == "ppo":
            self.agent = ppo.PPOTrainer(config=self.config_test)
        if algo == "ddpg":
            self.agent = ddpg.DDPGTrainer(config=self.config_test)
        if algo == "a3c":
            self.agent = a3c.A3CTrainer(config=self.config_test)
        if algo == "impala":
            self.agent = impala.ImpalaTrainer(config=self.config_test)
        if algo == "appo":
            self.agent = ppo.APPOTrainer(config=self.config_test)
        if algo == "td3":
            self.agent = ddpg.TD3Trainer(config=self.config_test)

        self.agent.restore(path)

        # (Several alternative ways of obtaining the env were tried here; use the
        # env instance held by this object, consistent with the step() calls below.)
        obs = self.env.reset()
        done = False

        while not done:
            action = self.agent.compute_action(obs)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward

            unused_shared.append(info["unused_shared"])
            unused_own.append(info["unused_own"])
            unsatisfied_shared.append(info["unsatisfied_shared"])
            unsatisfied_own.append(info["unsatisfied_own"])
        
        return episode_reward, unused_shared, unused_own, unsatisfied_shared, unsatisfied_own
Example no. 6
    def test_ddpg_compilation(self):
        """Test whether a DDPGTrainer can be built with both frameworks."""
        config = ddpg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.

        # Test against all frameworks.
        for _ in framework_iterator(config, "tf"):
            trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
            num_iterations = 2
            for i in range(num_iterations):
                results = trainer.train()
                print(results)
Example no. 7
    def test_ddpg_checkpoint_save_and_restore(self):
        """Test whether a DDPGTrainer can save and load checkpoints."""
        config = ddpg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 1
        config["num_envs_per_worker"] = 2
        config["learning_starts"] = 0
        config["exploration_config"]["random_timesteps"] = 100

        # Test against all frameworks.
        for _ in framework_iterator(config):
            trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
            trainer.train()
            with TemporaryDirectory() as temp_dir:
                checkpoint = trainer.save(temp_dir)
                trainer.restore(checkpoint)
            trainer.stop()
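In normal use the checkpoint is typically restored into a freshly built trainer, e.g. in a separate evaluation script; a minimal sketch under the same RLlib version, with an arbitrary checkpoint directory:

import ray
from ray.rllib.agents import ddpg

ray.init(ignore_reinit_error=True)
config = ddpg.DEFAULT_CONFIG.copy()
config["num_workers"] = 0

# Train briefly and write a checkpoint.
trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
trainer.train()
checkpoint = trainer.save("/tmp/ddpg_pendulum")
trainer.stop()

# Rebuild with the same config and restore the saved state.
restored = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
restored.restore(checkpoint)
restored.train()  # continues from the restored weights/timestep
restored.stop()
ray.shutdown()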
Example no. 8
    def test_ddpg_compilation(self):
        """Test whether a DDPGTrainer can be built with both frameworks."""
        config = ddpg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.

        # Test against all frameworks.
        for fw in ["tf", "eager", "torch"]:
            if fw != "tf":
                continue
            config["eager"] = True if fw == "eager" else False
            config["use_pytorch"] = True if fw == "torch" else False
            trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
            num_iterations = 2
            for i in range(num_iterations):
                results = trainer.train()
                print(results)
Example no. 9
    def test_ddpg_fake_multi_gpu_learning(self):
        """Test whether DDPGTrainer can run SimpleEnv w/ faked multi-GPU."""
        config = ddpg.DEFAULT_CONFIG.copy()
        # Fake GPU setup.
        config["num_gpus"] = 2
        config["_fake_gpus"] = True
        env = "ray.rllib.agents.sac.tests.test_sac.SimpleEnv"
        config["env_config"] = {"config": {"repeat_delay": 0}}

        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = ddpg.DDPGTrainer(config=config, env=env)
            num_iterations = 2
            for i in range(num_iterations):
                results = trainer.train()
                print(results)
            trainer.stop()
Example no. 10
    def test_ddpg_compilation(self):
        """Test whether a DDPGTrainer can be built with both frameworks."""
        config = ddpg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 1
        config["num_envs_per_worker"] = 2
        config["learning_starts"] = 0
        config["exploration_config"]["random_timesteps"] = 100

        num_iterations = 2

        # Test against all frameworks.
        for _ in framework_iterator(config, ("tf", "torch")):
            trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
            for i in range(num_iterations):
                results = trainer.train()
                print(results)
            check_compute_single_action(trainer)
Example no. 11
    def test_ddpg_fake_multi_gpu_learning(self):
        """Test whether DDPGTrainer can learn CartPole w/ faked multi-GPU."""
        config = ddpg.DEFAULT_CONFIG.copy()
        # Fake GPU setup.
        config["num_gpus"] = 2
        config["_fake_gpus"] = True
        env = "ray.rllib.agents.sac.tests.test_sac.SimpleEnv"
        config["env_config"] = {"config": {"repeat_delay": 0}}

        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = ddpg.DDPGTrainer(config=config, env=env)
            num_iterations = 50
            learnt = False
            for i in range(num_iterations):
                results = trainer.train()
                print(f"R={results['episode_reward_mean']}")
                if results["episode_reward_mean"] > 70.0:
                    learnt = True
                    break
            assert learnt, \
                f"DDPG multi-GPU (with fake-GPUs) did not learn {env}!"
            trainer.stop()
Example no. 12
    def _continuous_run(self):
        import ray
        from ray import tune
        from ray.rllib.agents import ppo, ddpg
        ray.init(num_cpus=4, num_gpus=1, local_mode=True)
        configs = {
            'num_gpus': 1,
            'num_workers': 4,
            # 'num_gpus_per_worker': 1,
            'framework': 'torch',
            "simple_optimizer": True,
        }
        AGENT_CONFIG = {
            'ddpg': ddpg.DDPGTrainer(config=configs, env="MountainCarContinuous-v0"),
            'ppo': ppo.PPOTrainer(config=configs, env="MountainCarContinuous-v0"),
        }
        trainer = AGENT_CONFIG[self.configs['algorithm']]
        # tune.run(agent, config={"env": "MountainCarContinuous-v0", "framework": "torch", "num_gpus": 0})
        for i in range(2000):  # 2000 epochs
            result = trainer.train()  # 1 epoch
            print(result)

        return
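Note that the AGENT_CONFIG dict above instantiates both trainers even though only one is used; storing the classes and constructing only the selected one avoids that cost. A minimal sketch of that variant (config trimmed to CPU-only here, algorithm name hard-coded for illustration):

import ray
from ray.rllib.agents import ppo, ddpg

ray.init(num_cpus=4, local_mode=True)
configs = {"num_workers": 4, "framework": "torch", "simple_optimizer": True}
AGENT_CLASSES = {"ddpg": ddpg.DDPGTrainer, "ppo": ppo.PPOTrainer}

algorithm = "ddpg"  # e.g. taken from self.configs["algorithm"]
# Only the selected trainer is ever built.
trainer = AGENT_CLASSES[algorithm](config=configs, env="MountainCarContinuous-v0")
for _ in range(3):
    print(trainer.train())
trainer.stop()
ray.shutdown()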
Example no. 13
    def test_ddpg_loss_function(self):
        """Tests DDPG loss function results across all frameworks."""
        config = ddpg.DEFAULT_CONFIG.copy()
        # Run locally.
        config["num_workers"] = 0
        config["learning_starts"] = 0
        config["twin_q"] = True
        config["use_huber"] = True
        config["huber_threshold"] = 1.0
        config["gamma"] = 0.99
        # Make this small (seems to introduce errors).
        config["l2_reg"] = 1e-10
        config["prioritized_replay"] = False
        # Use very simple nets.
        config["actor_hiddens"] = [10]
        config["critic_hiddens"] = [10]
        # Make sure timing differences do not affect trainer.train().
        config["min_iter_time_s"] = 0
        config["timesteps_per_iteration"] = 100

        map_ = {
            # Normal net.
            "default_policy/actor_hidden_0/kernel": "policy_model.action_0._model.0.weight",
            "default_policy/actor_hidden_0/bias": "policy_model.action_0._model.0.bias",
            "default_policy/actor_out/kernel": "policy_model.action_out._model.0.weight",
            "default_policy/actor_out/bias": "policy_model.action_out._model.0.bias",
            "default_policy/sequential/q_hidden_0/kernel": "q_model.q_hidden_0._model.0.weight",
            "default_policy/sequential/q_hidden_0/bias": "q_model.q_hidden_0._model.0.bias",
            "default_policy/sequential/q_out/kernel": "q_model.q_out._model.0.weight",
            "default_policy/sequential/q_out/bias": "q_model.q_out._model.0.bias",
            # -- twin.
            "default_policy/sequential_1/twin_q_hidden_0/kernel": "twin_q_model.twin_q_hidden_0._model.0.weight",
            "default_policy/sequential_1/twin_q_hidden_0/bias": "twin_q_model.twin_q_hidden_0._model.0.bias",
            "default_policy/sequential_1/twin_q_out/kernel": "twin_q_model.twin_q_out._model.0.weight",
            "default_policy/sequential_1/twin_q_out/bias": "twin_q_model.twin_q_out._model.0.bias",
            # Target net.
            "default_policy/actor_hidden_0_1/kernel": "policy_model.action_0._model.0.weight",
            "default_policy/actor_hidden_0_1/bias": "policy_model.action_0._model.0.bias",
            "default_policy/actor_out_1/kernel": "policy_model.action_out._model.0.weight",
            "default_policy/actor_out_1/bias": "policy_model.action_out._model.0.bias",
            "default_policy/sequential_2/q_hidden_0/kernel": "q_model.q_hidden_0._model.0.weight",
            "default_policy/sequential_2/q_hidden_0/bias": "q_model.q_hidden_0._model.0.bias",
            "default_policy/sequential_2/q_out/kernel": "q_model.q_out._model.0.weight",
            "default_policy/sequential_2/q_out/bias": "q_model.q_out._model.0.bias",
            # -- twin.
            "default_policy/sequential_3/twin_q_hidden_0/kernel": "twin_q_model.twin_q_hidden_0._model.0.weight",
            "default_policy/sequential_3/twin_q_hidden_0/bias": "twin_q_model.twin_q_hidden_0._model.0.bias",
            "default_policy/sequential_3/twin_q_out/kernel": "twin_q_model.twin_q_out._model.0.weight",
            "default_policy/sequential_3/twin_q_out/bias": "twin_q_model.twin_q_out._model.0.bias",
        }

        env = SimpleEnv
        batch_size = 100
        if env is SimpleEnv:
            obs_size = (batch_size, 1)
            actions = np.random.random(size=(batch_size, 1))
        elif env == "CartPole-v0":
            obs_size = (batch_size, 4)
            actions = np.random.randint(0, 2, size=(batch_size, ))
        else:
            obs_size = (batch_size, 3)
            actions = np.random.random(size=(batch_size, 1))

        # Batch of size=n.
        input_ = self._get_batch_helper(obs_size, actions, batch_size)

        # Simply compare loss values AND grads of all frameworks with each
        # other.
        prev_fw_loss = weights_dict = None
        expect_c, expect_a, expect_t = None, None, None
        # History of tf-updated NN-weights over n training steps.
        tf_updated_weights = []
        # History of input batches used.
        tf_inputs = []
        for fw, sess in framework_iterator(config,
                                           frameworks=("tf", "torch"),
                                           session=True):
            # Generate Trainer and get its default Policy object.
            trainer = ddpg.DDPGTrainer(config=config, env=env)
            policy = trainer.get_policy()
            p_sess = None
            if sess:
                p_sess = policy.get_session()

            # Set all weights (of all nets) to fixed values.
            if weights_dict is None:
                assert fw == "tf"  # Start with the tf vars-dict.
                weights_dict = policy.get_weights()
            else:
                assert fw == "torch"  # Then transfer that to torch Model.
                model_dict = self._translate_weights_to_torch(
                    weights_dict, map_)
                policy.model.load_state_dict(model_dict)
                policy.target_model.load_state_dict(model_dict)

            if fw == "torch":
                # Actually convert to torch tensors.
                input_ = policy._lazy_tensor_dict(input_)
                input_ = {k: input_[k] for k in input_.keys()}

            # Only run the expectation once; it should be the same anyway
            # for all frameworks.
            if expect_c is None:
                expect_c, expect_a, expect_t = \
                    self._ddpg_loss_helper(
                        input_, weights_dict, sorted(weights_dict.keys()), fw,
                        gamma=config["gamma"],
                        huber_threshold=config["huber_threshold"],
                        l2_reg=config["l2_reg"],
                        sess=sess)

            # Get actual outs and compare to expectation AND previous
            # framework. c=critic, a=actor, t=td-error.
            if fw == "tf":
                c, a, t, tf_c_grads, tf_a_grads = \
                    p_sess.run([
                        policy.critic_loss,
                        policy.actor_loss,
                        policy.td_error,
                        policy._critic_optimizer.compute_gradients(
                            policy.critic_loss,
                            policy.model.q_variables()),
                        policy._actor_optimizer.compute_gradients(
                            policy.actor_loss,
                            policy.model.policy_variables())],
                        feed_dict=policy._get_loss_inputs_dict(
                            input_, shuffle=False))
                # Check pure loss values.
                check(c, expect_c)
                check(a, expect_a)
                check(t, expect_t)

                tf_c_grads = [g for g, v in tf_c_grads]
                tf_a_grads = [g for g, v in tf_a_grads]

            elif fw == "torch":
                loss_torch(policy, policy.model, None, input_)
                c, a, t = policy.critic_loss, policy.actor_loss, \
                    policy.td_error
                # Check pure loss values.
                check(c, expect_c)
                check(a, expect_a)
                check(t, expect_t)

                # Test actor gradients.
                policy._actor_optimizer.zero_grad()
                assert all(v.grad is None for v in policy.model.q_variables())
                assert all(v.grad is None
                           for v in policy.model.policy_variables())
                a.backward()
                # `actor_loss` depends on Q-net vars
                # (but not twin-Q-net vars!).
                assert not any(v.grad is None
                               for v in policy.model.q_variables()[:4])
                assert all(v.grad is None
                           for v in policy.model.q_variables()[4:])
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.policy_variables())
                assert not all(
                    torch.min(v.grad) == 0
                    for v in policy.model.policy_variables())
                # Compare with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g))
                    else:
                        check(tf_g, torch_g)

                # Test critic gradients.
                policy._critic_optimizer.zero_grad()
                assert all(v.grad is None or torch.mean(v.grad) == 0.0
                           for v in policy.model.q_variables())
                assert all(v.grad is None or torch.min(v.grad) == 0.0
                           for v in policy.model.q_variables())
                c.backward()
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.q_variables())
                assert not all(
                    torch.min(v.grad) == 0 for v in policy.model.q_variables())
                # Compare with tf ones.
                torch_c_grads = [v.grad for v in policy.model.q_variables()]
                for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g))
                    else:
                        check(tf_g, torch_g)
                # Compare (unchanged(!) actor grads) with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g))
                    else:
                        check(tf_g, torch_g)

            # Store this framework's losses in prev_fw_loss to compare with
            # next framework's outputs.
            if prev_fw_loss is not None:
                check(c, prev_fw_loss[0])
                check(a, prev_fw_loss[1])
                check(t, prev_fw_loss[2])

            prev_fw_loss = (c, a, t)

            # Update weights from our batch (n times).
            for update_iteration in range(10):
                print("train iteration {}".format(update_iteration))
                if fw == "tf":
                    in_ = self._get_batch_helper(obs_size, actions, batch_size)
                    tf_inputs.append(in_)
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = LocalReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    updated_weights = policy.get_weights()
                    # Net must have changed.
                    if tf_updated_weights:
                        check(updated_weights[
                            "default_policy/actor_hidden_0/kernel"],
                              tf_updated_weights[-1]
                              ["default_policy/actor_hidden_0/kernel"],
                              false=True)
                    tf_updated_weights.append(updated_weights)

                # Compare with updated tf-weights. Must all be the same.
                else:
                    tf_weights = tf_updated_weights[update_iteration]
                    in_ = tf_inputs[update_iteration]
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = LocalReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    # Compare updated model and target weights.
                    for tf_key in tf_weights.keys():
                        tf_var = tf_weights[tf_key]
                        # Target model (variables with the _1/_2/_3 suffixes).
                        if re.search(
                                "actor_out_1|actor_hidden_0_1|sequential_"
                                "[23]", tf_key):
                            torch_var = policy.target_model.state_dict()[
                                map_[tf_key]]
                        # Base model.
                        else:
                            torch_var = policy.model.state_dict()[map_[tf_key]]
                        if tf_var.shape != torch_var.shape:
                            check(tf_var, np.transpose(torch_var), rtol=0.07)
                        else:
                            check(tf_var, torch_var, rtol=0.07)
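The np.transpose calls in the gradient and weight comparisons above are needed because tf Dense kernels are stored as (in_features, out_features) while torch.nn.Linear weights are (out_features, in_features); a tiny standalone check of those conventions (assuming tf2/keras and torch are installed):

import tensorflow as tf
import torch

dense = tf.keras.layers.Dense(10)
dense.build((None, 3))
print(dense.kernel.shape)    # (3, 10)  -> (in_features, out_features)

linear = torch.nn.Linear(3, 10)
print(linear.weight.shape)   # torch.Size([10, 3]) -> (out_features, in_features)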
    config["num_gpus"] = 0

if algorithm == 'A2C':
    RLAgent = a2c.A2CTrainer(env=env_name, config=config)
elif algorithm == 'ADQN':
    RLAgent = adqn.ApexTrainer(env=env_name, config=config)
elif algorithm == 'DQN':
    RLAgent = dqn.DQNTrainer(env=env_name, config=config)
elif algorithm == 'IMPALA':
    RLAgent = impala.ImpalaTrainer(env=env_name, config=config)
elif algorithm == 'PPO':
    RLAgent = ppo.PPOTrainer(env=env_name, config=config)
elif algorithm == 'RDQN':
    RLAgent = dqn.DQNTrainer(env=env_name, config=config)
elif algorithm == "DDPG":
    RLAgent = ddpg.DDPGTrainer(env=env_name, config=config)

print(checkpoint_path, flush=True)
#RLAgent.restore(checkpoint_path)

num_runs = 50
totalRewards = np.empty((num_runs, ))

policy = RLAgent.get_policy("policy_0")
for j in range(num_runs):
    observations = env.reset()
    rewards, action_dict = {}, {}
    for agent_id in env.agents:
        rewards[agent_id] = 0

    totalReward = 0
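The example above is cut off inside the evaluation loop. For reference, a self-contained sketch of a simple evaluation loop over several runs with a (single-agent) RLlib trainer; the env id, run count and the use of the local worker's env copy are assumptions, not taken from the fragment:

import numpy as np
import ray
from ray.rllib.agents import ddpg

ray.init(ignore_reinit_error=True)
trainer = ddpg.DDPGTrainer(env="Pendulum-v0", config={"num_workers": 0})
# trainer.restore(checkpoint_path)  # optionally load trained weights first

num_runs = 5
totalRewards = np.empty((num_runs,))
env = trainer.workers.local_worker().env  # reuse the trainer's own env copy

for j in range(num_runs):
    obs, done, totalReward = env.reset(), False, 0.0
    while not done:
        obs, reward, done, _ = env.step(trainer.compute_action(obs, explore=False))
        totalReward += reward
    totalRewards[j] = totalReward

print("mean episode reward:", totalRewards.mean())
trainer.stop()
ray.shutdown()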
Example no. 15
def render(checkpoint, home_path):
    """
    Renders pybullet and mujoco environments.
    """
    alg = re.match('.+?(?=_)', os.path.basename(os.path.normpath(home_path))).group(0)
    current_env = re.search("(?<=_).*?(?=_)", os.path.basename(os.path.normpath(home_path))).group(0)
    checkpoint_path = home_path + "checkpoint_" + str(checkpoint) + "/checkpoint-" + str(checkpoint)
    config = json.load(open(home_path + "params.json"))
    config_bin = pickle.load(open(home_path + "params.pkl", "rb"))
    ray.shutdown()
    import pybullet_envs
    ray.init()
    ModelCatalog.register_custom_model("RBF", RBFModel)
    ModelCatalog.register_custom_model("MLP_2_64", MLP)
    ModelCatalog.register_custom_model("linear", Linear)

    if alg == "PPO":
        trainer = ppo.PPOTrainer(config_bin)
    if alg == "SAC":
        trainer = sac.SACTrainer(config)
    if alg == "DDPG":
        trainer = ddpg.DDPGTrainer(config)
    if alg == "PG":
        trainer = pg.PGTrainer(config)
    if alg == "A3C":
        trainer = a3c.A3CTrainer(config)
    if alg == "TD3":
        trainer = td3.TD3Trainer(config)
    if alg == "ES":
        trainer = es.ESTrainer(config)
    if alg == "ARS":
        trainer = ars.ARSTrainer(config)
#   "normalize_actions": true,
    trainer.restore(checkpoint_path)

    if "Bullet" in current_env:
        env = gym.make(current_env, render=True)
    else:
        env = gym.make(current_env)
    #env.unwrapped.reset_model = det_reset_model
    env._max_episode_steps = 10000
    obs = env.reset()

    action_hist = []
    m_act_hist = []
    state_hist  = []
    obs_hist = []
    reward_hist = []

    done = False
    step = 0

    for t in range(10000):
        # For some algorithms you can pull the sample mean out of the extra fetches
        # (the slice index must match your env's action dimensionality):
        # sampled_actions, _, out_dict = trainer.compute_action(obs.flatten(), full_fetch=True)
        # mean_actions = out_dict['behaviour_logits'][:17]
        sampled_actions = trainer.compute_action(obs.flatten())

        actions = sampled_actions
        
        obs, reward, done, _ = env.step(np.asarray(actions))
        # env.camera_adjust()
        env.render(mode='human')
        time.sleep(0.01)
        # env.render()
        # env.render(mode='rgb_array', close = True)
        # p.computeViewMatrix(cameraEyePosition=[0,10,5], cameraTargetPosition=[0,0,0], cameraUpVector=[0,0,0])

        # if step % 1000 == 0:
        #     env.reset()
        # step += 1
        
        action_hist.append(np.copy(actions))
        obs_hist.append(np.copy(obs))
        reward_hist.append(np.copy(reward))
        if done:
            obs = env.reset()
    # print(sum(reward_hist))
    # print((obs_hist))
    #plt.plot(action_hist)
    #plt.figure()
    #plt.figure()
    #plt.plot(obs_hist)
    #plt.figure()

    # Reminder: the behaviour logits that come out are the mean and log-std (not log-mean, despite the name "logit").
    # trainer.compute_action(obs, full_fetch=True)
    trainer.compute_action(obs)
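As the commented-out lines in the rollout loop hint, the distribution parameters (mean and log-std for stochastic policies such as PPO or SAC) can be fetched together with the action via full_fetch=True; inside the loop above this would look roughly like the following sketch (the exact key names in the returned dict, e.g. 'behaviour_logits', depend on the RLlib version):

        # Fetch the action together with the policy's extra outputs.
        sampled_actions, state_outs, out_dict = trainer.compute_action(
            obs.flatten(), full_fetch=True)
        print(sorted(out_dict.keys()))  # e.g. contains the mean/log-std outputs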
Example no. 16
            "pol0":
            (None, temp_env.observation_space[0], temp_env.action_space[0], {
                "agent_id": 0,
            }),
            "pol1":
            (None, temp_env.observation_space[1], temp_env.action_space[1], {
                "agent_id": 1,
            }),
        },
        "policy_mapping_fn": lambda x: "pol0"
        if x == 0 else "pol1",  # # Function mapping agent ids to policy ids
        # "observation_fn": central_critic_observer, # See rllib/evaluation/observation_function.py for more info
    }

    #### Restore agent #########################################
    agent = ddpg.DDPGTrainer(config=config)
    # with open(ARGS.exp+'/checkpoint.txt', 'r+') as f:
    #     checkpoint = f.read()
    checkpoint = "/home/mahendra/git/gym-pybullet-drones/experiments/learning/results/save-payloadcoop-2-cc-payload_one_sensor-xyz_yaw-03.25.2021_20.20.48/DDPG_2021-03-25_20-20-51/DDPG_this-aviary-v0_ebf05_00000_0_2021-03-25_20-20-52/checkpoint_40/checkpoint-40"
    agent.restore(checkpoint)

    #### Extract and print policies ############################
    policy0 = agent.get_policy("pol0")
    # print("action model 0", policy0.model.action_model)
    # print("value model 0", policy0.model.value_model)
    policy1 = agent.get_policy("pol1")
    # print("action model 1", policy1.model.action_model)
    # print("value model 1", policy1.model.value_model)

    #### Create test environment ###############################
    test_env = PayloadCoop(num_drones=NUM_DRONES,