Example #1
    def test_multi_gpu_dqn_agent_compilation(self):
        """
        Tests whether the multi-GPU strategy compiles successfully on a multi-GPU system, and also
        runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        root_logger.setLevel(DEBUG)
        agent_config = config_from_path(
            "configs/multi_gpu_dqn_for_random_env.json")
        environment = RandomEnv.from_spec(self.random_env_spec)

        agent = DQNAgent.from_spec(agent_config,
                                   state_space=environment.state_space,
                                   action_space=environment.action_space)
        print("Compiled DQN agent on multi-GPU system")

        # Do an update from external batch.
        batch_size = agent_config["update_spec"]["batch_size"]
        external_batch = dict(
            states=environment.state_space.sample(size=batch_size),
            actions=environment.action_space.sample(size=batch_size),
            rewards=np.random.sample(size=batch_size),
            terminals=np.random.choice([True, False], size=batch_size),
            next_states=environment.state_space.sample(size=batch_size),
            importance_weights=np.zeros(shape=(batch_size, )))
        agent.update(batch=external_batch)
        print("Performed an update from external batch")
Example #2
    def test_multi_gpu_apex_agent_compilation(self):
        """
        Tests whether the multi-GPU strategy compiles successfully on a multi-GPU system, and also
        runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        root_logger.setLevel(DEBUG)
        agent_config = config_from_path("configs/multi_gpu_ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = ApexAgent.from_spec(
            agent_config, state_space=environment.state_space, action_space=environment.action_space
        )
        print("Compiled Apex agent")
Example #3
    def test_multi_gpu_apex_agent_compilation(self):
        """
        Tests whether the multi-GPU strategy compiles successfully on a multi-GPU system.

        THIS TEST REQUIRES A MULTI GPU SYSTEM.
        """
        root_logger.setLevel(DEBUG)
        agent_config = config_from_path("configs/multi_gpu_ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = ApexAgent.from_spec(
            agent_config, state_space=environment.state_space, action_space=environment.action_space
        )
        print("Compiled Apex agent")
Example #4
class TestPyTorchUtil(unittest.TestCase):
    """
    Tests some torch utils.
    """
    root_logger.setLevel(level=logging.INFO)

    def test_one_hot(self):
        """
        Tests a torch one hot function.
        """
        if get_backend() == "pytorch":
            # Flat action array.
            inputs = torch.tensor([0, 1], dtype=torch.int32)
            one_hot = pytorch_one_hot(inputs, depth=2)

            expected = torch.tensor([[1., 0.], [0., 1.]])
            recursive_assert_almost_equal(one_hot, expected)

            # Container space.
            inputs = torch.tensor([[0, 3, 2], [1, 2, 0]], dtype=torch.int32)
            one_hot = pytorch_one_hot(inputs, depth=4)

            expected = torch.tensor(
                [[[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]],
                 [[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]],
                dtype=torch.int32)
            recursive_assert_almost_equal(one_hot, expected)
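
For comparison, PyTorch's own torch.nn.functional.one_hot produces the same encoding as the pytorch_one_hot utility exercised above; a minimal standalone sketch (plain PyTorch, not RLgraph; note that F.one_hot requires int64 inputs):

import torch
import torch.nn.functional as F

# F.one_hot requires int64 (long) indices; the depth is passed as num_classes.
inputs = torch.tensor([[0, 3, 2], [1, 2, 0]], dtype=torch.int64)
one_hot = F.one_hot(inputs, num_classes=4)  # shape: (2, 3, 4)

expected = torch.tensor(
    [[[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]],
     [[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]])
assert torch.equal(one_hot, expected)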
Example #5
class TestPPOAgentFunctionality(unittest.TestCase):
    """
    Tests the PPO Agent's functionality.
    """
    root_logger.setLevel(level=logging.DEBUG)

    def test_post_processing(self):
        """
        Tests external batch post-processing for the PPO agent.
        """
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)
        agent_config = config_from_path("configs/ppo_agent_for_pong.json")
        agent = PPOAgent.from_spec(agent_config,
                                   state_space=env.state_space,
                                   action_space=env.action_space)
        num_samples = 200
        states = agent.preprocessed_state_space.sample(num_samples)
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        sequence_indices_space = BoolBox(add_batch_rank=True)

        # GAE is separately tested, just testing if this API method returns results.
        pg_advantages = agent.post_process(
            dict(states=states,
                 rewards=reward_space.sample(num_samples),
                 terminals=terminal_space.sample(num_samples, fill_value=0),
                 sequence_indices=sequence_indices_space.sample(num_samples,
                                                                fill_value=0)))
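
The comment above notes that GAE itself is tested separately; for background, a minimal NumPy sketch of generalized advantage estimation (a standalone illustration of the standard GAE recursion, not RLgraph's implementation; rewards, values, terminals, gamma and gae_lambda are assumed inputs, with values holding one extra bootstrap entry):

import numpy as np

def gae_advantages(rewards, values, terminals, gamma=0.99, gae_lambda=0.95):
    # `values` has len(rewards) + 1 entries; the last one bootstraps the final step.
    advantages = np.zeros(len(rewards), dtype=np.float32)
    last_advantage = 0.0
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - float(terminals[t])
        delta = rewards[t] + gamma * values[t + 1] * non_terminal - values[t]
        last_advantage = delta + gamma * gae_lambda * non_terminal * last_advantage
        advantages[t] = last_advantage
    return advantages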
Example #6
class TestBaseAgentFunctionality(unittest.TestCase):
    """
    Tests the base Agent's functionality.
    """
    root_logger.setLevel(level=logging.DEBUG)

    def test_weights_getting_setting(self):
        """
        Tests getting and setting of the Agent's weights.
        """
        env = GridWorld(world="2x2")
        agent = Agent.from_spec(
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        weights = agent.get_weights()
        new_weights = {}
        for key, weight in weights["policy_weights"].items():
            new_weights[key] = weight + 0.01

        agent.set_weights(new_weights)
        new_actual_weights = agent.get_weights()

        recursive_assert_almost_equal(new_actual_weights["policy_weights"],
                                      new_weights)
Example #7
class TestDQNAgentLongTaskLearning(unittest.TestCase):
    """
    Tests whether the DQNAgent can learn in tough environments.
    """
    root_logger.setLevel(level=logging.INFO)

    pong_preprocessed_state_space = FloatBox(shape=(80, 80, 4),
                                             add_batch_rank=True)

    def test_dqn_on_pong(self):
        """
        Creates a DQNAgent and runs it via a Runner on an openAI Pong Env.
        """
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True,
                           visualize=False)
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        preprocessing_spec = agent_config.pop("preprocessor_spec")
        agent = Agent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=self.pong_preprocessed_state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)

        time_steps = 4000000
        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      render=True,
                                      preprocessing_spec=preprocessing_spec,
                                      worker_executes_preprocessing=True)
        results = worker.execute_timesteps(time_steps, use_exploration=True)
Example #8
class TestInputIncompleteTest(unittest.TestCase):
    """
    Tests different scenarios where a model is built but some pieces remain input-incomplete
    (these should then be reported via meaningful error messages).
    """
    root_logger.setLevel(level=logging.INFO)

    def test_inner_deadlock_of_component(self):
        """
        Component cannot be built due to its sub-component remaining input incomplete.
        """
        a = DummyProducingInputIncompleteBuild(scope="A")
        try:
            test = ComponentTest(component=a, input_spaces=dict(input_=float))
        except RLGraphBuildError as e:
            print("Seeing expected RLGraphBuildError ({}). Test ok.".format(e))
        else:
            raise RLGraphError("Not seeing expected RLGraphBuildError with input-incomplete model!")

    def test_solution_of_inner_deadlock_of_component_with_must_be_complete_false(self):
        """
        Component can be built due to its sub-component resolving a deadlock with `must_be_complete`.
        """
        a = DummyProducingInputIncompleteBuild(scope="A")
        deadlock_component = a.sub_components["dummy-calling-one-api-from-within-other"]
        # Manually set the must_be_complete flag to false.
        deadlock_component.api_methods["run_inner"].must_be_complete = False
        test = ComponentTest(component=a, input_spaces=dict(input_=float))
        print("Not seeing RLGraphBuildError. Test ok.")
Example #9
class TestApexAgentFunctionality(unittest.TestCase):
    """
    Tests Ape-X specific functionality.
    """
    root_logger.setLevel(level=logging.INFO)

    def test_apex_weight_syncing(self):
        env = RandomEnv(state_space=spaces.IntBox(2),
                        action_space=spaces.IntBox(2),
                        deterministic=True)

        agent = Agent.from_spec(
            config_from_path("configs/apex_agent_for_random_env.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        policy_weights = agent.get_policy_weights()
        print('policy weights: {}'.format(policy_weights))

        for variable, weights in policy_weights.items():
            weights += 0.01
        agent.set_policy_weights(policy_weights)

        new_weights = agent.get_policy_weights()
        recursive_assert_almost_equal(policy_weights, new_weights)
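
The weight perturbation above works because NumPy's += mutates the arrays stored in the dict in place; a minimal standalone illustration of that behaviour (hypothetical key name, plain NumPy):

import numpy as np

policy_weights = {"policy/dense/kernel": np.zeros((2, 2))}
for variable, weights in policy_weights.items():
    weights += 0.01  # in-place add: mutates the array held by the dict
assert np.allclose(policy_weights["policy/dense/kernel"], 0.01)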
Example #10
class TestInputSpaceChecking(unittest.TestCase):
    """
    Tests whether faulty ops are caught after calling `sanity_check_space` in `check_input_spaces` of a Component.
    """
    root_logger.setLevel(level=logging.INFO)

    def test_faulty_op_catching(self):
        """
        Tests that a faulty op (a float vector fed into the int-expecting EmbeddingLookup) is caught
        by `sanity_check_space` during the build phase.
        """
        # Construct some easy component containing a sub-component.
        dense_layer = DenseLayer(units=2, scope="dense-layer")
        string_layer = EmbeddingLookup(embed_dim=3,
                                       vocab_size=4,
                                       scope="embed-layer")
        container_component = Component(dense_layer, string_layer)

        # Add the component's API method.
        @rlgraph_api(component=container_component)
        def test_api(self, a):
            dense_result = self.get_sub_component_by_name("dense-layer").call(a)
            # First call dense to get a vector output, then call embedding, which is expecting an int input.
            # This should fail EmbeddingLookup's input space checking (only during the build phase).
            return self.get_sub_component_by_name("embed-layer").call(dense_result)

        # Test graphviz component graph drawing.
        draw_meta_graph(container_component, apis=True)

        test = ComponentTest(
            component=container_component,
            input_spaces=dict(
                a=spaces.FloatBox(shape=(4, ), add_batch_rank=True)))
Example #11
class TestActorCriticShortTaskLearning(unittest.TestCase):
    """
    Tests whether the Actor-critic can learn in simple environments.
    """
    root_logger.setLevel(level=logging.INFO)

    is_windows = os.name == "nt"

    def test_actor_critic_on_2x2_grid_world(self):
        """
        Creates an Actor-Critic agent and runs it via a Runner on the 2x2 GridWorld Env.
        """
        env = GridWorld(world="2x2")
        agent = ActorCriticAgent.from_spec(
            config_from_path(
                "configs/actor_critic_agent_for_2x2_gridworld.json"),
            state_space=GridWorld.grid_world_2x2_flattened_state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=13),
        )

        time_steps = 30000
        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=True,
            preprocessing_spec=GridWorld.grid_world_2x2_preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        # Assume we have learned something.
        self.assertGreater(results["mean_episode_reward"], -0.1)

    def test_actor_critic_on_cart_pole(self):
        """
        Creates an Actor-Critic agent and runs it via a Runner on the CartPole Env.
        """
        env_spec = dict(type="open-ai-gym",
                        gym_env="CartPole-v0",
                        visualize=False)  #self.is_windows)
        dummy_env = OpenAIGymEnv.from_spec(env_spec)
        agent = ActorCriticAgent.from_spec(
            config_from_path("configs/actor_critic_agent_for_cartpole.json"),
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space)

        time_steps = 20000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], 20)
        self.assertGreaterEqual(results["max_episode_reward"], 100.0)
Example #12
class TestIMPALAAgentLongTaskLearning(unittest.TestCase):
    """
    Tests whether the IMPALAAgent can learn in tough environments.
    """
    root_logger.setLevel(level=logging.INFO)

    #atari_preprocessed_state_space = FloatBox(shape=(80, 80, 4), add_batch_rank=True)
    #atari_preprocessing_spec = [
    #    dict(type="image_crop", x=0, y=25, width=160, height=160),
    #    dict(type="image_resize", width=80, height=80),
    #    dict(type="grayscale", keep_rank=True),
    #    dict(type="divide", divisor=255,),
    #    dict(type="sequence", sequence_length=4, batch_size=1, add_rank=False)
    #]

    def test_impala_on_outbreak(self):
        """
        Creates an IMPALAAgent and runs it (via direct update calls) on an openAI Breakout Env.
        """
        env = OpenAIGymEnv("Breakout-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True,
                           visualize=False)
        config_ = config_from_path("configs/impala_agent_for_breakout.json")
        agent = IMPALAAgent.from_spec(
            config_,
            state_space=env.state_space,
            action_space=env.action_space,
        )

        learn_updates = 4000000
        mean_returns = []
        for i in range(learn_updates):
            ret = agent.update()
            mean_return = self._calc_mean_return(ret)
            mean_returns.append(mean_return)
            print("i={} Loss={:.4} Avg-reward={:.2}".format(
                i, float(ret[1]), mean_return))

        time.sleep(3)
        agent.terminate()
        time.sleep(3)

    @staticmethod
    def _calc_mean_return(records):
        size = records[3]["rewards"].size
        rewards = records[3]["rewards"].reshape((size, ))
        terminals = records[3]["terminals"].reshape((size, ))
        returns = list()
        return_ = 0.0
        for r, t in zip(rewards, terminals):
            return_ += r
            if t:
                returns.append(return_)
                return_ = 0.0

        return np.mean(returns)
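
A quick standalone check of the return-segmentation logic in _calc_mean_return above, using synthetic rewards and terminals (illustrative data only):

import numpy as np

rewards = np.array([1.0, 0.0, 1.0, 1.0, 0.0, 2.0])
terminals = np.array([False, False, True, False, False, True])

returns, return_ = [], 0.0
for r, t in zip(rewards, terminals):
    return_ += r
    if t:  # episode boundary: store the accumulated return and reset
        returns.append(return_)
        return_ = 0.0
# Two completed episodes with returns 2.0 and 3.0 -> mean of 2.5.
assert np.mean(returns) == 2.5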
Example #13
class TestPPOShortTaskLearning(unittest.TestCase):
    """
    Tests whether the PPO agent can learn in simple environments.
    """
    root_logger.setLevel(level=logging.INFO)

    is_windows = os.name == "nt"

    def test_ppo_on_2x2_grid_world(self):
        """
        Creates a PPO Agent and runs it via a Runner on the 2x2 Grid World Env.
        """
        env = GridWorld(world="2x2")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_2x2_gridworld.json"),
            state_space=GridWorld.grid_world_2x2_flattened_state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=15),
        )

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=True,
            preprocessing_spec=GridWorld.grid_world_2x2_preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        # Assume we have learned something.
        self.assertGreater(results["mean_episode_reward"], -0.2)

    def test_ppo_on_cart_pole(self):
        """
        Creates a PPO Agent and runs it via a Runner on the CartPole Env.
        """
        env = OpenAIGymEnv("CartPole-v0", seed=36)
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        time_steps = 3000
        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        #self.assertGreaterEqual(results["mean_episode_reward"], 23)
        #self.assertGreaterEqual(results["max_episode_reward"], 100.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 10)
Example #14
class TestVisualizations(unittest.TestCase):
    """
    Tests whether components and meta-(sub)-graphs get visualized properly.
    """
    root_logger.setLevel(level=logging.INFO)

    def test_ppo_agent_visualization(self):
        """
        Creates a PPOAgent and visualizes meta-graph (no APIs) and the NN-component.
        """
        env = GridWorld(world="2x2")
        env.render()
        ppo_agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_2x2_gridworld.json"),
            state_space=GridWorld.grid_world_2x2_flattened_state_space,
            action_space=env.action_space)

        # Test graphviz component-graph drawing.
        draw_meta_graph(ppo_agent.root_component,
                        output=rlgraph_dir + "/ppo.gv",
                        apis=False,
                        graph_fns=False)
        self.assertTrue(os.path.isfile(rlgraph_dir + "/ppo.gv"))
        # Test graphviz component-graph w/ API drawing (only the Policy component).
        draw_meta_graph(ppo_agent.policy.neural_network,
                        output=rlgraph_dir + "/ppo_nn.gv",
                        apis=True)
        self.assertTrue(os.path.isfile(rlgraph_dir + "/ppo_nn.gv"))

    def test_ppo_agent_faulty_op_visualization(self):
        """
        Creates a PPOAgent with a badly connected network and visualizes the root component.
        """
        agent_config = config_from_path(
            "configs/ppo_agent_for_2x2_gridworld.json")
        # Sabotage the NN.
        agent_config["network_spec"] = [{
            "type": "dense",
            "units": 10
        }, {
            "type": "embedding",
            "embed_dim": 3,
            "vocab_size": 4
        }]
        env = GridWorld(world="2x2")
        # Build Agent and hence trigger the Space error.
        try:
            ppo_agent = PPOAgent.from_spec(
                agent_config,
                state_space=GridWorld.grid_world_2x2_flattened_state_space,
                action_space=env.action_space)
        except RLGraphSpaceError as e:
            print("Seeing expected RLGraphSpaceError ({}). Test ok.".format(e))
        else:
            raise RLGraphError(
                "Not seeing expected RLGraphSpaceError with faulty input Space to embed layer of PPO!"
            )
Example #15
    def __init__(self, worker, seed=10, logging_level=None, enable_profiler=False):
        """
        Args:
            worker (Worker): The Worker (holding the Env and Agent) to use for stepping.
            #seed (Optional[int]): The seed to use for random-seeding the Model object.
            #    If None, do not seed the Graph (things may behave non-deterministically).
            logging_level (Optional[int]): When provided, sets RLGraph's root_logger's logging level to this value.
            enable_profiler (Optional[bool]): When enabled, activates backend profiling.
        """
        self.worker = worker
        self.agent = self.worker.agent
        self.env = self.worker.vector_env.get_env()
        self.seed = seed
        if logging_level is not None:
            root_logger.setLevel(logging_level)

        # Simply use the Agent's GraphExecutor.
        self.graph_executor = self.agent.graph_executor
Example #16
class TestBaseAgentFunctionality(unittest.TestCase):
    """
    Tests the base Agent's functionality.
    """
    root_logger.setLevel(level=logging.DEBUG)

    def test_weights_getting_setting(self):
        """
        Tests getting and setting of the Agent's weights.
        """
        env = GridWorld(world="2x2")
        agent = Agent.from_spec(
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        weights = agent.get_weights()
        new_weights = {}
        for key, weight in weights["policy_weights"].items():
            new_weights[key] = weight + 0.01

        agent.set_weights(new_weights)
        new_actual_weights = agent.get_weights()

        recursive_assert_almost_equal(new_actual_weights["policy_weights"],
                                      new_weights)

    def test_value_function_weights(self):
        """
        Tests changing of value function weights.
        """
        env = OpenAIGymEnv("Pong-v0")
        agent_config = config_from_path("configs/ppo_agent_for_pong.json")
        agent = PPOAgent.from_spec(agent_config,
                                   state_space=env.state_space,
                                   action_space=env.action_space)
        weights = agent.get_weights()
        assert "value_function_weights" in weights
        assert "policy_weights" in weights

        policy_weights = weights["policy_weights"]
        value_function_weights = weights["value_function_weights"]

        # Just change vf weights.
        for key, weight in value_function_weights.items():
            value_function_weights[key] = weight + 0.01
        agent.set_weights(policy_weights, value_function_weights)
        new_actual_weights = agent.get_weights()

        recursive_assert_almost_equal(
            new_actual_weights["value_function_weights"],
            value_function_weights)
Example #17
class TestApexAgentFunctionality(unittest.TestCase):
    """
    Tests Ape-X specific functionality.
    """
    root_logger.setLevel(level=logging.INFO)

    def test_apex_weight_syncing(self):
        agent_config = config_from_path("configs/ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = Agent.from_spec(
            agent_config,
            state_space=environment.state_space,
            action_space=environment.action_space
        )

        weights = agent.get_weights()["policy_weights"]
        print("type weights = ", type(weights))
        for variable, value in weights.items():
            print("Type value = ", type(value))
            value += 0.01
        agent.set_weights(weights)

        new_weights = agent.get_weights()["policy_weights"]
        recursive_assert_almost_equal(weights, new_weights)

    def test_update_from_external(self):
        agent_config = config_from_path("configs/ray_apex_for_pong.json")
        agent_config["execution_spec"].pop("ray_spec")
        environment = OpenAIGymEnv("Pong-v0", frameskip=4)

        agent = Agent.from_spec(
            agent_config,
            state_space=environment.state_space,
            action_space=environment.action_space
        )

        batch = {
            "states": agent.preprocessed_state_space.sample(200),
            "actions": environment.action_space.sample(200),
            "rewards": np.zeros(200, dtype=np.float32),
            "terminals": [False] * 200,
            "next_states": agent.preprocessed_state_space.sample(200),
            "importance_weights":  np.ones(200, dtype=np.float32)
        }

        agent.update(batch)
Example #18
class TestIMPALAAgentShortTaskLearning(unittest.TestCase):
    """
    Tests whether the IMPALAAgent can learn in simple environments.
    """
    root_logger.setLevel(level=logging.INFO)

    def test_impala_on_2x2_grid_world(self):
        """
        Creates a single IMPALAAgent and runs it via the IMPALAWorker on a simple 2x2 GridWorld.
        """
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=12),
            update_spec=dict(update_interval=4, batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
        )

        learn_updates = 1000
        # Setup the queue runner.
        agent.call_api_method("setup_queue_runner")
        for _ in range(learn_updates):
            agent.update()

        #print("STATES:\n{}".format(agent.last_q_table["states"]))
        #print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

        #self.assertEqual(results["timesteps_executed"], time_steps)
        #self.assertEqual(results["env_frames"], time_steps)
        #self.assertGreaterEqual(results["mean_episode_reward"], -3.5)
        #self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        #self.assertLessEqual(results["episodes_executed"], 350)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)
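
The expected Q-values above follow from the 2x2 GridWorld's reward structure; as a generic reference for where such values come from, a minimal tabular Q-learning update (a sketch of the standard update rule, not RLgraph code and not tied to these exact numbers):

import numpy as np

num_states, num_actions = 4, 4
q_table = np.zeros((num_states, num_actions))
alpha, gamma = 0.1, 1.0

def q_update(s, a, r, s_next, terminal):
    # One-step Q-learning target: r + gamma * max_a' Q(s', a') for non-terminal transitions.
    target = r if terminal else r + gamma * np.max(q_table[s_next])
    q_table[s, a] += alpha * (target - q_table[s, a])

q_update(s=0, a=1, r=-1.0, s_next=1, terminal=False)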
Example #19
class TestPytorchBackend(unittest.TestCase):
    """
    Tests PyTorch component execution.

    # TODO: This is a temporary test. We will later run all backend-specific
    tests via setting the executor in the component-test.
    """
    root_logger.setLevel(level=logging.INFO)

    def test_cartpole_with_worker(self):
        env = OpenAIGymEnv("CartPole-v0")
        agent_config = config_from_path("configs/backend_performance_dqn_cartpole.json")

        # Test cpu settings for batching here.
        agent_config["update_spec"] = None

        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=lambda: OpenAIGymEnv("CartPole-v0"),
            agent=agent,
            frameskip=1,
            num_environments=1,
            worker_executes_preprocessing=False
        )

        result = worker.execute_timesteps(1000)
        print(result)

    def test_pong_with_worker(self):
        env_spec = dict(
            type="openai",
            gym_env="PongNoFrameskip-v4",
            # The frameskip in the agent config will trigger worker skips; this one
            # is used for the internal env.
            frameskip=4,
            max_num_noops=30,
            episodic_life=False
        )

        env = OpenAIGymEnv.from_spec(env_spec)
        agent_config = config_from_path("configs/backend_performance_dqn_pong.json")

        # Test cpu settings for batching here.
        agent_config["update_spec"] = None

        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=env_spec,
            agent=agent,
            frameskip=1,
            preprocessing_spec=agent_config["preprocessing_spec"],
            worker_executes_preprocessing=True
        )

        result = worker.execute_timesteps(1000)
        print(result)
Example #20
class TestPytorchBackend(unittest.TestCase):
    """
    Tests PyTorch component execution.

    # TODO: This is a temporary test. We will later run all backend-specific
    tests via setting the executor in the component-test.
    """
    root_logger.setLevel(level=logging.INFO)

    def test_api_call_no_variables(self):
        """
        Tests define-by-run call of api method via defined_api method on a
        component without variables.
        """
        a = Dummy2To1()
        test = ComponentTest(component=a,
                             input_spaces=dict(input1=float, input2=float))
        test.test(("run", [1.0, 2.0]), expected_outputs=3.0, decimals=4)

    def test_connecting_1to2_to_2to1(self):
        """
        Adds two components with 1-to-2 and 2-to-1 graph_fns to the core, connects them and passes a value through it.
        """
        core = Component(scope="container")
        sub_comp1 = Dummy1To2(scope="comp1")  # outs=in,in+1
        sub_comp2 = Dummy2To1(scope="comp2")  # out =in1+in2
        core.add_components(sub_comp1, sub_comp2)

        @rlgraph_api(component=core)
        def run(self_, input_):
            out1, out2 = sub_comp1.run(input_)
            return sub_comp2.run(out1, out2)

        test = ComponentTest(component=core, input_spaces=dict(input_=float))

        # Expected output: input + (input + 1.0)
        test.test(("run", 100.9),
                  expected_outputs=np.array(202.8, dtype=np.float32))
        test.test(("run", -5.1),
                  expected_outputs=np.array(-9.2, dtype=np.float32))

    def test_calling_sub_components_api_from_within_graph_fn(self):
        a = DummyCallingSubComponentsAPIFromWithinGraphFn(scope="A")
        test = ComponentTest(component=a, input_spaces=dict(input_=float))

        # Expected: (1): 2*in + 10
        test.test(("run", 1.1), expected_outputs=12.2, decimals=4)

    def test_1to1_to_2to1_component_with_constant_input_value(self):
        """
        Adds two components in sequence, 1-to-1 and 2-to-1, to the core and blocks one of the api_methods of 2-to-1
        with a constant value (so that this constant value is not at the border of the root-component).
        """
        core = Component(scope="container")
        sub_comp1 = Dummy1To1(scope="A")
        sub_comp2 = Dummy2To1(scope="B")
        core.add_components(sub_comp1, sub_comp2)

        @rlgraph_api(component=core)
        def run(self_, input_):
            out = sub_comp1.run(input_)
            return sub_comp2.run(out, 1.1)

        test = ComponentTest(component=core, input_spaces=dict(input_=float))

        # Expected output: (input + 1.0) + 1.1
        test.test(("run", 78.4), expected_outputs=80.5)
        test.test(("run", -5.2), expected_outputs=-3.1)

    def test_dqn_compilation(self):
        """
        Creates a DQNAgent from a PyTorch config and checks that it compiles.
        """
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)
        agent_config = config_from_path("configs/dqn_pytorch_test.json")
        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)

    def test_memory_compilation(self):
        # Builds a memory and returns build stats.
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)

        record_space = Dict(states=env.state_space,
                            actions=env.action_space,
                            rewards=float,
                            terminals=BoolBox(),
                            add_batch_rank=True)
        input_spaces = dict(
            # insert: records
            records=record_space,
            # get_records: num_records
            num_records=int,
            # update_records: indices, update
            indices=IntBox(add_batch_rank=True),
            update=FloatBox(add_batch_rank=True))

        input_spaces.pop("num_records")
        memory = MemPrioritizedReplay(capacity=20000, )
        test = ComponentTest(component=memory,
                             input_spaces=input_spaces,
                             auto_build=False)
        return test.build()

    # TODO -> batch dim works differently in pytorch -> have to squeeze.
    def test_dense_layer(self):
        # Space must contain batch dimension (otherwise, NNLayer will complain).
        space = FloatBox(shape=(2, ), add_batch_rank=True)

        # - fixed 1.0 weights, no biases
        dense_layer = DenseLayer(units=2, weights_spec=1.0, biases_spec=False)
        test = ComponentTest(component=dense_layer,
                             input_spaces=dict(inputs=space))

        # Batch of size=1 (can increase this to any larger number).
        input_ = np.array([0.5, 2.0])
        expected = np.array([2.5, 2.5])
        test.test(("apply", input_), expected_outputs=expected)

    def test_nn_assembly_from_file(self):
        # Space must contain batch dimension (otherwise, NNLayer will complain).
        space = FloatBox(shape=(3, ), add_batch_rank=True)

        # Create a simple neural net from json.
        neural_net = NeuralNetwork.from_spec(
            config_from_path(
                "configs/test_simple_nn.json"))  # type: NeuralNetwork

        # Do not seed, we calculate expectations manually.
        test = ComponentTest(component=neural_net,
                             input_spaces=dict(inputs=space),
                             seed=None)

        # Batch of size=3.
        input_ = np.array([[0.1, 0.2, 0.3], [1.0, 2.0, 3.0],
                           [10.0, 20.0, 30.0]])

        # Can't fetch variables here.

        out = test.test(("apply", input_), decimals=5)
        print(out)

    def test_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (5 possible actions).
        action_space = IntBox(5, add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(nn_input=state_space),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs (4 input nodes, batch size=2).
        states = np.array([[-0.08, 0.4, -0.05, -0.55],
                           [13.0, -14.0, 10.0, -16.0]])
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=6)

        # Raw action layer output; Expected shape=(2,5): 2=batch, 5=action categories
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            policy_params["policy/action-adapter/action-layer/dense/kernel"])
        expected_action_layer_output = np.reshape(expected_action_layer_output,
                                                  newshape=(2, 5))
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(output=expected_action_layer_output),
                  decimals=5)

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions,
                                        last_internal_states=None))

        # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
        expected_probabilities_output = softmax(expected_action_layer_output,
                                                axis=-1)
        test.test(("get_logits_probabilities_log_probs", states, [0, 1, 2]),
                  expected_outputs=dict(
                      logits=expected_action_layer_output,
                      probabilities=expected_probabilities_output,
                      log_probs=np.log(expected_probabilities_output)),
                  decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2, ))

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (2, ))

    def test_act(self):
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)
        agent_config = config_from_path("configs/ray_apex_for_pong.json")
        if get_backend() == "pytorch":
            agent_config["memory_spec"]["type"] = "mem_prioritized_replay"
        agent = DQNAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)
        state = env.reset()
        action = agent.get_action(state)
        print("Component call count = {}".format(Component.call_count))

        state_space = env.state_space
        count = 200

        samples = state_space.sample(count)
        start = time.perf_counter()
        for s in samples:
            action = agent.get_action(s)
        end = time.perf_counter() - start

        print("Took {} s for {} separate actions, mean = {}".format(
            end, count, end / count))

        # Now test batched actions instead (one call for all sampled states).
        samples = state_space.sample(count)
        start = time.perf_counter()
        action = agent.get_action(samples)
        end = time.perf_counter() - start
        print("Took {} s for {} batched actions.".format(end, count))
        profile = Component.call_times
        print_call_chain(profile, False, 0.03)

    def test_get_td_loss(self):
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)
        agent_config = config_from_path("configs/ray_apex_for_pong.json")

        # Test cpu settings for batching here.
        agent_config["memory_spec"]["type"] = "mem_prioritized_replay"
        agent_config["execution_spec"]["torch_num_threads"] = 1
        agent_config["execution_spec"]["OMP_NUM_THREADS"] = 1

        agent = ApexAgent.from_spec(
            # Uses 2015 DQN parameters as closely as possible.
            agent_config,
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions, up, down, no-op)
            action_space=env.action_space)
        samples = 200
        rewards = np.random.random(size=samples)
        states = list(agent.preprocessed_state_space.sample(samples))
        actions = agent.action_space.sample(samples)
        terminals = np.zeros(samples, dtype=np.uint8)
        next_states = states[1:]
        next_states.extend([agent.preprocessed_state_space.sample(1)])
        next_states = np.asarray(next_states)
        states = np.asarray(states)
        weights = np.ones_like(rewards)

        for _ in range(1):
            start = time.perf_counter()
            _, loss_per_item = agent.get_td_loss(
                dict(states=states,
                     actions=actions,
                     rewards=rewards,
                     terminals=terminals,
                     next_states=next_states,
                     importance_weights=weights))
            print("post process time = {}".format(time.perf_counter() - start))
        profile = Component.call_times
        print_call_chain(profile, False, 0.003)
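
The policy test in this example derives its expected probabilities via softmax(expected_action_layer_output, axis=-1) and then takes logs; a minimal NumPy version of that softmax/log-prob relationship (standalone sketch, not RLgraph's helper):

import numpy as np

def softmax(logits, axis=-1):
    # Subtract the per-row max for numerical stability before exponentiating.
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=axis, keepdims=True)

logits = np.array([[0.5, 1.5, -0.5], [2.0, 0.0, 0.0]])
probs = softmax(logits)
log_probs = np.log(probs)
assert np.allclose(probs.sum(axis=-1), 1.0)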
Example #21
class TestPPOShortTaskLearning(unittest.TestCase):
    """
    Tests whether the PPO agent can learn in simple environments.
    """
    root_logger.setLevel(level=logging.INFO)

    is_windows = os.name == "nt"

    def test_ppo_on_2x2_grid_world(self):
        """
        Creates a PPO Agent and runs it via a Runner on the 2x2 Grid World env.
        """
        env = GridWorld(world="2x2")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_2x2_gridworld.json"),
            state_space=GridWorld.grid_world_2x2_flattened_state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=15),
        )

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=True,
            preprocessing_spec=GridWorld.grid_world_2x2_preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        # Assume we have learned something.
        self.assertGreater(results["mean_episode_reward"], -0.2)

    def test_ppo_on_2x2_grid_world_with_container_actions(self):
        """
        Creates a PPO agent and runs it via a Runner on a simple 2x2 GridWorld using container actions.
        """
        # ftj = forward + turn + jump
        env_spec = dict(world="2x2",
                        action_type="ftj",
                        state_representation="xy+orientation")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/ppo_agent_for_2x2_gridworld_with_container_actions.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        agent = PPOAgent.from_spec(agent_config,
                                   state_space=FloatBox(shape=(4, )),
                                   action_space=dummy_env.action_space)

        time_steps = 10000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True,
            render=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)
        print("Results =", results)

    def test_ppo_on_cart_pole(self):
        """
        Creates a PPO Agent and runs it via a Runner on the CartPole env.
        """
        env = OpenAIGymEnv("CartPole-v0", seed=36)
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        time_steps = 3000
        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertLessEqual(results["episodes_executed"], time_steps / 10)

    def test_ppo_on_pendulum(self):
        """
        Creates a PPO Agent and runs it via a Runner on the Pendulum env.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        worker = SingleThreadedWorker(env_spec=lambda: env,
                                      agent=agent,
                                      worker_executes_preprocessing=False,
                                      render=self.is_windows)
        results = worker.execute_episodes(500, use_exploration=True)

        print(results)
Example #22
class TestPPOAgentFunctionality(unittest.TestCase):
    """
    Tests the PPO Agent's functionality.
    """
    root_logger.setLevel(level=logging.DEBUG)

    def test_post_processing(self):
        """
        Tests external batch post-processing for the PPO agent.
        """
        env = OpenAIGymEnv("Pong-v0",
                           frameskip=4,
                           max_num_noops=30,
                           episodic_life=True)
        agent_config = config_from_path("configs/ppo_agent_for_pong.json")
        agent = PPOAgent.from_spec(agent_config,
                                   state_space=env.state_space,
                                   action_space=env.action_space)
        num_samples = 200
        states = agent.preprocessed_state_space.sample(num_samples)
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)
        sequence_indices_space = BoolBox(add_batch_rank=True)

        # GAE is separately tested, just testing if this API method returns results.
        pg_advantages = agent.post_process(
            dict(states=states,
                 rewards=reward_space.sample(num_samples),
                 terminals=terminal_space.sample(num_samples, fill_value=0),
                 sequence_indices=sequence_indices_space.sample(num_samples,
                                                                fill_value=0)))

    def test_ppo_on_container_state_and_action_spaces_and_very_large_rewards(
            self):
        """
        Tests stability of PPO on an extreme env that produces unusual container states, hugely negative
        rewards, and requires container actions.
        """
        env = RandomEnv(
            state_space=Dict(
                {"F_position": FloatBox(shape=(2, ), low=0.01, high=0.02)}),
            action_space=Dict({
                "F_direction_low-1.0_high1.0":
                FloatBox(shape=(), low=-1.0, high=1.0),
                "F_forward_direction_low-1.0_high1.0":
                FloatBox(shape=(), low=-1.0, high=1.0),
                "B_jump":
                BoolBox()
            }),
            reward_space=FloatBox(low=-1000.0,
                                  high=-100000.0),  # hugely negative rewards
            terminal_prob=0.0000001)

        agent_config = config_from_path(
            "configs/ppo_agent_for_random_env_with_container_spaces.json")
        agent = PPOAgent.from_spec(agent_config,
                                   state_space=env.state_space,
                                   action_space=env.action_space)

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            preprocessing_spec=None,
            worker_executes_preprocessing=True,
            #episode_finish_callback=lambda episode_return, duration, timesteps, env_num:
            #print("episode return {}; steps={}".format(episode_return, timesteps))
        )
        results = worker.execute_timesteps(num_timesteps=int(1e6),
                                           use_exploration=True)

        print(results)
Example #23
class TestGraphFns(unittest.TestCase):
    """
    Tests for different ways to send DataOps through GraphFunctions.
    Tests flattening, splitting, etc.. operations.
    """
    root_logger.setLevel(level=logging.INFO)

    def test_2_containers_flattening_splitting(self):
        """
        Adds a single component with 2-to-2 graph_fn to the core and passes two containers through it
        with flatten/split options enabled.
        """
        input1_space = spaces.Dict(a=float, b=spaces.FloatBox(shape=(1, 2)))
        input2_space = spaces.Dict(a=float, b=float)

        component = FlattenSplitDummy()
        test = ComponentTest(component=component,
                             input_spaces=dict(input1=input1_space,
                                               input2=input2_space))

        # Options: fsu=flat/split/un-flat.
        in1_fsu = dict(a=np.array(0.234), b=np.array([[0.0, 3.0]]))
        in2_fsu = dict(a=np.array(5.0), b=np.array(5.5))
        # Result of sending 'a' keys through graph_fn: (in1[a]+1.0=1.234, in1[a]+in2[a]=5.234)
        # Result of sending 'b' keys through graph_fn: (in1[b]+1.0=[[1, 4]], in1[b]+in2[b]=[[5.5, 8.5]])
        out1_fsu = dict(a=1.234, b=np.array([[1.0, 4.0]]))
        out2_fsu = dict(a=np.array(5.234, dtype=np.float32),
                        b=np.array([[5.5, 8.5]]))
        test.test(("run", [in1_fsu, in2_fsu]),
                  expected_outputs=[out1_fsu, out2_fsu])

    def test_1_containers_1_float_flattening_splitting(self):
        """
        Adds a single component with 2-to-2 graph_fn to the core and passes one container and one float through it
        with flatten/split options all disabled.
        """
        input1_space = spaces.Dict(a=float, b=spaces.FloatBox(shape=(1, 2)))
        input2_space = spaces.FloatBox(shape=(1, 1))

        component = FlattenSplitDummy()
        test = ComponentTest(component=component,
                             input_spaces=dict(input1=input1_space,
                                               input2=input2_space))

        # Options: fsu=flat/split/un-flat.
        in1_fsu = dict(a=np.array(0.234), b=np.array([[0.0, 3.0]]))
        in2_fsu = np.array([[2.0]])
        # Result of sending 'a' keys through graph_fn: (in1[a]+1.0=1.234, in1[a]+in2=2.234)
        # Result of sending 'b' keys through graph_fn: (in1[b]+1.0=[[1, 4]], in1[b]+in2=[[2.0, 5.0]])
        out1_fsu = dict(a=1.234, b=np.array([[1.0, 4.0]]))
        out2_fsu = dict(a=np.array([[2.234]], dtype=np.float32),
                        b=np.array([[2.0, 5.0]]))
        test.test(("run", [in1_fsu, in2_fsu]),
                  expected_outputs=[out1_fsu, out2_fsu])

    def test_2_containers_no_options(self):
        """
        Adds a single component with 2-to-2 graph_fn to the core and passes one container and one float through it
        with no flatten/split options enabled.
        """
        input1_space = spaces.Dict(a=int, b=bool)
        input2_space = spaces.Dict(c=bool, d=int)

        component = NoFlattenNoSplitDummy()
        test = ComponentTest(component=component,
                             input_spaces=dict(input1=input1_space,
                                               input2=input2_space))

        # Options: fsu=flat/split.
        in1 = dict(a=5, b=True)
        in2 = dict(c=False, d=3)
        # Expect reversal (see graph_fn)
        out1 = in2
        out2 = in1
        test.test(("run", [in1, in2]), expected_outputs=[out1, out2])

    def test_1_container_1_float_only_flatten(self):
        """
        Adds a single component with 2-to-3 graph_fn to the core and passes one container and one float through it
        with only the flatten option enabled.
        """
        input1_space = spaces.Dict(a=float, b=float, c=spaces.Tuple(float))
        input2_space = spaces.FloatBox(shape=(1, ))

        component = OnlyFlattenDummy(constant_value=5.0)
        test = ComponentTest(component=component,
                             input_spaces=dict(input1=input1_space,
                                               input2=input2_space))

        # Options: only flatten_ops=True.
        in1 = dict(a=5.4, b=3.4, c=tuple([3.2]))
        in2 = np.array([1.2])
        # out1: dict(in1_f key: in1_f value + in2_f[""])
        # out2: dict(in1_f key: in1_f value - in2_f[""])
        # out3: in2_f
        out1 = dict(a=in1["a"] + in2,
                    b=in1["b"] + in2,
                    c=tuple([in1["c"][0] + in2]))
        out2 = dict(a=in1["a"] - in2,
                    b=in1["b"] - in2,
                    c=tuple([in1["c"][0] - in2]))
        out3 = in2
        test.test(("run", [in1, in2]),
                  expected_outputs=[out1, out2, out3],
                  decimals=5)

    def test_calling_graph_fn_from_inside_another_graph_fn(self):
        """
        One graph_fn gets called from within another. Must return actual ops from inner one so that the outer one
        can handle it.
        """
        input_space = spaces.FloatBox(shape=(2, ))
        component = Dummy2NestedGraphFnCalls()
        test = ComponentTest(component=component,
                             input_spaces=dict(input_=input_space))

        input_ = input_space.sample()
        expected = input_ - 1.0
        test.test(("run", input_), expected_outputs=expected, decimals=5)

    def test_component_that_defines_custom_graph_fns(self):
        a = DummyThatDefinesCustomGraphFn()

        test = ComponentTest(component=a, input_spaces=dict(input_=float))

        test.test(("run", 3.4567), expected_outputs=3.4567, decimals=3)

    def test_calling_graph_fn_with_default_args_in_middle(self):
        a = Dummy3To1WithDefaultValues()
        test = ComponentTest(component=a, input_spaces=dict(input1=float))
        # Will put default float into input2.
        test.test(("run", 1.0), expected_outputs=2.0, decimals=3)

        b = Dummy3To1WithDefaultValues()
        test = ComponentTest(component=b,
                             input_spaces=dict(input1=int, input3=int))
        test.test(("run", [5, 6]), expected_outputs=6, decimals=0)

        c = Dummy3To1WithDefaultValues()
        test = ComponentTest(
            component=c, input_spaces=dict(input1=float, input4=float)
        )  # TODO: if we leave out input4, should create a default-placeholder with default value = 1.0 (see api-method)
        test.test(("run2", [1.0, 2.0]), expected_outputs=3.0, decimals=3)
Example #24
class TestDQNAgentFunctionality(unittest.TestCase):
    """
    Tests the DQN Agent's functionality.
    """
    root_logger.setLevel(level=logging.DEBUG)

    def test_dqn_functionality(self):
        """
        Creates a DQNAgent and runs it for a few steps in a GridWorld to rigorously test
        all steps of the learning process.
        """
        env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
        agent = Agent.from_spec(  # type: DQNAgent
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            double_q=True,
            dueling_q=True,
            state_space=env.state_space,
            action_space=env.action_space,
            discount=0.95)
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld(world="2x2", save_mode=True),
            agent=agent)
        test = AgentTest(worker=worker)

        # Helper python DQNLossFunc object.
        loss_func = DQNLossFunction(backend="python",
                                    double_q=True,
                                    discount=agent.discount)
        loss_func.when_input_complete(input_spaces=dict(loss_per_item=[
            spaces.FloatBox(shape=(4, ), add_batch_rank=True),
            spaces.IntBox(4, add_batch_rank=True),
            spaces.FloatBox(add_batch_rank=True),
            spaces.BoolBox(add_batch_rank=True),
            spaces.FloatBox(shape=(4, ), add_batch_rank=True),
            spaces.FloatBox(shape=(4, ), add_batch_rank=True)
        ]),
                                      action_space=env.action_space)
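        # The 6 loss_per_item input spaces above correspond (in order) to the arguments passed to
        # loss_func._graph_fn_loss_per_item() in _helper_update_matrix() below:
        # q-values(s), actions, rewards, terminals, target-q-values(s'), q-values(s').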

        # Weight matrices of the fake linear network checked against below:
        # matrix1: 4 one-hot state inputs x 2 hidden units;
        # matrix2: 2 hidden units x 5 output nodes (1 state-value + 4 advantages for the dueling layer).
        matrix1_qnet = np.array([[0.9] * 2] * 4)
        matrix2_qnet = np.array([[0.8] * 5] * 2)
        matrix1_target_net = np.array([[0.9] * 2] * 4)
        matrix2_target_net = np.array([[0.8] * 5] * 2)

        a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

        # 1st step -> Expect insert into python-buffer.
        # action: up (0)
        test.step(1, reset=True)
        # Environment's new state.
        test.check_env("state", 0)
        # Agent's buffer.
        test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]],
                         key_or_index="env_0")  # <- prev state (preprocessed)
        test.check_agent("actions_buffer", [a], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        # Memory contents.
        test.check_var("replay-memory/index", 0)
        test.check_var("replay-memory/size", 0)
        test.check_var("replay-memory/memory/states",
                       np.array([[0] * 4] * agent.memory.capacity))
        test.check_var("replay-memory/memory/actions",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/rewards",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False] * agent.memory.capacity))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 2nd step -> expect insert into memory (and python buffer should be empty again).
        # action: up (0)
        # Also check the policy and target policy values (Should be equal at this point).
        test.step(1)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 2)
        test.check_var("replay-memory/size", 2)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] + [False] * (agent.memory.capacity - 2)))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
        # actions: down (2), up (0)  <- use_exploration=True -> more random actions.
        # Expect an update to the policy variables (leave target as is (no sync yet)).
        test.step(2, use_exploration=True)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 4)
        test.check_var("replay-memory/size", 4)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/actions",
            np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0] * 4 +  # + [-3.0] +
                     [0.0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] * 2 + [False] *
                     (agent.memory.capacity - 4)))
        # Get the latest memory batch.
        expected_batch = dict(states=np.array([[1.0, 0.0, 0.0, 0.0],
                                               [1.0, 0.0, 0.0, 0.0]]),
                              actions=np.array([0, 1]),
                              rewards=np.array([-1.0, -3.0]),
                              terminals=np.array([False, True]),
                              next_states=np.array([[1.0, 0.0, 0.0, 0.0],
                                                    [0.0, 0.0, 0.0, 0.0]]))
        test.check_agent("last_memory_batch", expected_batch)

        # Calculate the weight updates and check against actually update weights by the AgentDQN.
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet,
                                                 matrix1_target_net,
                                                 matrix2_target_net, agent,
                                                 loss_func)
        # Check policy and target-policy weights (policy should be updated now).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        matrix1_qnet = mat_updated[0]
        matrix2_qnet = mat_updated[1]

        # 5th step -> Another buffer update check.
        # action: down (2) (weights have been updated -> different actions)
        test.step(1)
        test.check_env("state", 3)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty b/c we reached end of episode (buffer gets force-flushed)
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 5)
        test.check_var("replay-memory/size", 5)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False, True] * 2 + [True, False]))
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        # 6th/7th step (with exploration enabled) -> Another buffer update check.
        # action: up, down (0, 2)
        test.step(2, use_exploration=True)
        test.check_env("state", 1)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty again; flushed after 6th step (when buffer was full).
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index",
                       1)  # index has rolled over: (5 + 2) % 6 = 1 (memory capacity is 6)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=4)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_target_net)

        # 8th step -> Another buffer update check and weights update and sync.
        # action: down (2)
        test.step(1)
        test.check_env("state", 1)
        test.check_agent("states_buffer", [1], key_or_index="env_0")
        test.check_agent("actions_buffer", [2], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        expected_batch = dict(
            states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
            actions=np.array([0, 1]),
            rewards=np.array([-1.0, -3.0]),
            terminals=np.array([True, True]),
            next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
            # TODO: <- This is wrong and must be fixed
            # (next-state of first item is from a previous insert and unrelated to first item)
        )
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 1)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        # Assume that the sync happens first (matrices are already the same when updating).
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet, matrix1_qnet,
                                                 matrix2_qnet, agent,
                                                 loss_func)

        # Now target-net should be again 1 step behind policy-net.
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=2)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=2)  # again: old matrix
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            mat_updated[1],
            decimals=2)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=2)

    def _calculate_action(self, state, matrix1, matrix2):
        s = np.asarray([state])
        s_flat = one_hot(s, depth=4)
        q_values = self._helper_get_q_values(s_flat, matrix1, matrix2)
        # Assume greedy.
        return np.argmax(q_values)

    @staticmethod
    def _helper_get_q_values(input_, matrix1, matrix2):
        """
        Calculates the q-values for a given simple 1-hidden 1-action-layer (both linear w/o biases) setup.

        Args:
            input_ (np.ndarray): The input array (batch x in-nodes).
            matrix1 (np.ndarray): The weights matrix of the hidden layer.
            matrix2 (np.ndarray): The weights matrix of the action-layer.

        Returns:
            np.ndarray: The calculated q-values.
        """
        # Simple NN implementation.
        nn_output = np.matmul(np.matmul(input_, matrix1), matrix2)
        # Simple dueling layer implementation.
        state_values = np.expand_dims(nn_output[:, 0], axis=-1)
        q_values = state_values + nn_output[:, 1:] - np.mean(
            nn_output[:, 1:], axis=-1, keepdims=True)
        return q_values
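        # Worked example of the dueling aggregation above (illustration only, made-up numbers):
        # for a single output row nn_output = [0.5, 1.0, 3.0], the state-value is 0.5 and the
        # advantages are [1.0, 3.0] with mean 2.0, so
        # q_values = 0.5 + [1.0, 3.0] - 2.0 = [-0.5, 1.5].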

    def _helper_update_matrix(self, expected_batch, matrix1_qnet, matrix2_qnet,
                              matrix1_target_net, matrix2_target_net, agent,
                              loss_func):
        # Calculate gradient per weight based on the above batch.
        q_s = self._helper_get_q_values(expected_batch["states"], matrix1_qnet,
                                        matrix2_qnet)
        q_sp = self._helper_get_q_values(expected_batch["next_states"],
                                         matrix1_qnet, matrix2_qnet)
        qt_sp = self._helper_get_q_values(expected_batch["next_states"],
                                          matrix1_target_net,
                                          matrix2_target_net)

        # The loss without weight changes.
        loss = np.mean(
            loss_func._graph_fn_loss_per_item(q_s, expected_batch["actions"],
                                              expected_batch["rewards"],
                                              expected_batch["terminals"],
                                              qt_sp, q_sp))
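        # For reference, a rough sketch of what the double-Q loss per item computes (not the
        # exact RLGraph implementation, which may e.g. apply a Huber loss):
        #   a'         = argmax over q_sp (next-state q-values from the online net)
        #   td_target  = reward + discount * (1 - terminal) * qt_sp[a']  (target-net q-value)
        #   loss/item ~= (td_target - q_s[action]) ** 2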

        # Calculate the dLoss/dw for all individual weights (w) and apply [- LR * dLoss/dw] to each weight.
        # Then check again against the actual, now optimized weights.
        mat_updated = list()
        for i, mat in enumerate([matrix1_qnet, matrix2_qnet]):
            mat_updated.append(mat.copy())
            for index in np.ndindex(mat.shape):
                mat_w_plus_d = mat.copy()
                mat_w_plus_d[index] += 0.0001
                if i == 0:
                    q_s_plus_d = self._helper_get_q_values(
                        expected_batch["states"], mat_w_plus_d, matrix2_qnet)
                    q_sp_plus_d = self._helper_get_q_values(
                        expected_batch["next_states"], mat_w_plus_d,
                        matrix2_qnet)
                else:
                    q_s_plus_d = self._helper_get_q_values(
                        expected_batch["states"], matrix1_qnet, mat_w_plus_d)
                    q_sp_plus_d = self._helper_get_q_values(
                        expected_batch["next_states"], matrix1_qnet,
                        mat_w_plus_d)

                loss_w_plus_d = np.mean(
                    loss_func._graph_fn_loss_per_item(
                        q_s_plus_d, expected_batch["actions"],
                        expected_batch["rewards"], expected_batch["terminals"],
                        qt_sp, q_sp_plus_d))
                # One-sided finite difference. Note the sign: (loss - loss_w_plus_d) / delta
                # approximates -dLoss/dw, so adding learning_rate * dl_over_dw below amounts to
                # a gradient-descent step.
                dl_over_dw = (loss - loss_w_plus_d) / 0.0001

                # Apply the changes to our matrices, then check their actual values.
                mat_updated[i][
                    index] += agent.optimizer.learning_rate * dl_over_dw

        return mat_updated
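
    # A minimal, self-contained sketch of the finite-difference scheme used in
    # _helper_update_matrix() above (illustration only: this helper and its `delta` argument
    # are made up for clarity and are not part of the original test suite).
    @staticmethod
    def _numerical_gradient_sketch(f, w, delta=0.0001):
        """Approximates df/dw element-wise via the one-sided difference (f(w + delta) - f(w)) / delta."""
        grad = np.zeros_like(w)
        for index in np.ndindex(w.shape):
            w_plus = w.copy()
            w_plus[index] += delta
            grad[index] = (f(w_plus) - f(w)) / delta
        return grad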
Exemplo n.º 25
0
class TestIMPALAAgentShortTaskLearning(unittest.TestCase):
    """
    Tests whether the IMPALAAgent can learn in simple environments.
    """
    root_logger.setLevel(level=logging.INFO)

    is_windows = os.name == "nt"

    def test_impala_on_2x2_grid_world(self):
        """
        Creates a single IMPALAAgent and runs it via a simple loop on a 2x2 GridWorld.
        """
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=12),
            update_spec=dict(batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05))

        learn_updates = 50
        for i in range(learn_updates):
            ret = agent.update()
            mean_return = self._calc_mean_return(ret)
            print("i={} Loss={:.4} Avg-reward={:.2}".format(
                i, float(ret[1]), mean_return))

        # Assume we have learned something.
        self.assertGreater(mean_return, -0.1)

        # Check the last action probs for the 2 valid next_states: start (after a reset) and the state below start.
        action_probs = ret[3]["action_probs"].reshape((80, 4))
        next_states = ret[3]["states"][:, 1:].reshape((80, ))
        for s_, probs in zip(next_states, action_probs):
            # Start state:
            # - Assume we picked "right" in state=1 (in order to step into goal state).
            # - OR we picked "up" or "left" in state=0 (unlikely, but possible).
            if s_ == 0:
                recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
                self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
                recursive_assert_almost_equal(probs[3], 0.0, decimals=2)
            # One below start:
            # - Assume we picked "down" in start state with very large probability.
            # - OR we picked "left" or "down" in state=1 (unlikely, but possible).
            elif s_ == 1:
                recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
                self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
                recursive_assert_almost_equal(probs[3], 0.0, decimals=2)

        agent.terminate()

    def test_impala_on_cart_pole(self):
        """
        Creates a single IMPALAAgent and runs it via a simple loop on CartPole-v0.
        """
        env_spec = dict(type="open-ai-gym",
                        gym_env="CartPole-v0",
                        seed=10,
                        visualize=self.is_windows)
        config_ = config_from_path("configs/impala_agent_for_cartpole.json")
        config_["environment_spec"] = env_spec
        dummy_env = OpenAIGymEnv.from_spec(env_spec)
        agent = IMPALAAgent.from_spec(config_,
                                      state_space=dummy_env.state_space,
                                      action_space=dummy_env.action_space,
                                      execution_spec=dict(seed=10))

        learn_updates = 300
        mean_returns = []
        for i in range(learn_updates):
            ret = agent.update()
            mean_return = self._calc_mean_return(ret)
            mean_returns.append(mean_return)
            print("i={}/{} Loss={:.4} Avg-reward={:.2}".format(
                i, learn_updates, float(ret[1]), mean_return))

        # Assume we have learned something.
        average_return_last_n_episodes = np.nanmean(mean_returns[-100:])
        print("Average return over last n episodes: {}".format(
            average_return_last_n_episodes))
        self.assertGreater(average_return_last_n_episodes, 30.0)

        time.sleep(3)
        agent.terminate()
        time.sleep(3)

    @staticmethod
    def _calc_mean_return(records):
        size = records[3]["rewards"].size
        rewards = records[3]["rewards"].reshape((size, ))
        terminals = records[3]["terminals"].reshape((size, ))
        returns = list()
        return_ = 0.0
        for r, t in zip(rewards, terminals):
            return_ += r
            if t:
                returns.append(return_)
                return_ = 0.0

        return np.mean(returns)
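        # Example (illustration only): rewards=[1, 1, 1, 2], terminals=[False, False, True, True]
        # -> per-episode returns [3, 2] -> mean return 2.5. Rewards accumulated after the last
        # terminal (i.e. from an unfinished episode) are ignored.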
Exemplo n.º 26
0
class TestSACAgentFunctionality(unittest.TestCase):
    """
    Tests the SAC Agent's functionality.
    """
    root_logger.setLevel(level=logging.DEBUG)

    def test_sac_agent_component_functionality(self):
        config = config_from_path(
            "configs/sac_component_for_fake_env_test.json")

        # Arbitrary state space, state should not be used in this example.
        state_space = FloatBox(shape=(8, ))
        continuous_action_space = FloatBox(shape=(1, ), low=-2.0, high=2.0)
        terminal_space = BoolBox(add_batch_rank=True)
        rewards_space = FloatBox(add_batch_rank=True)
        policy = Policy.from_spec(config["policy"],
                                  action_space=continuous_action_space)
        policy.add_components(Synchronizable(), expose_apis="sync")
        q_function = ValueFunction.from_spec(config["value_function"])

        agent_component = SACAgentComponent(
            agent=None,
            policy=policy,
            q_function=q_function,
            preprocessor=PreprocessorStack.from_spec([]),
            memory=ReplayMemory.from_spec(config["memory"]),
            discount=config["discount"],
            initial_alpha=config["initial_alpha"],
            target_entropy=None,
            optimizer=AdamOptimizer.from_spec(config["optimizer"]),
            vf_optimizer=AdamOptimizer.from_spec(
                config["value_function_optimizer"], scope="vf-optimizer"),
            alpha_optimizer=None,
            q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
            num_q_functions=2)

        test = ComponentTest(
            component=agent_component,
            input_spaces=dict(
                states=state_space.with_batch_rank(),
                preprocessed_states=state_space.with_batch_rank(),
                env_actions=continuous_action_space.with_batch_rank(),
                actions=continuous_action_space.with_batch_rank(),
                rewards=rewards_space,
                next_states=state_space.with_batch_rank(),
                terminals=terminal_space,
                batch_size=int,
                preprocessed_s_prime=state_space.with_batch_rank(),
                importance_weights=FloatBox(add_batch_rank=True),
                preprocessed_next_states=state_space.with_batch_rank(),
                deterministic=bool,
                weights="variables:{}".format(policy.scope),
                # TODO: how to provide the space for multiple component variables?
                #q_weights=Dict(
                #    q_0="variables:{}".format(q_function.scope),
                #    q_1="variables:{}".format(agent_component._q_functions[1].scope),
                #)
            ),
            action_space=continuous_action_space,
            build_kwargs=dict(
                optimizer=agent_component._optimizer,
                build_options=dict(
                    vf_optimizer=agent_component.vf_optimizer, ),
            ))

        batch_size = 10
        action_sample = continuous_action_space.with_batch_rank().sample(
            batch_size)
        rewards = rewards_space.sample(batch_size)
        # Check, whether an update runs ok.
        result = test.test((
            "update_from_external_batch",
            [
                state_space.sample(batch_size),
                action_sample,
                rewards,
                [True] * batch_size,
                state_space.sample(batch_size),
                [1.0] * batch_size  # importance
            ]))
        self.assertTrue(result["actor_loss"].dtype == np.float32)
        self.assertTrue(result["critic_loss"].dtype == np.float32)

        action_sample = np.linspace(-1, 1, batch_size).reshape((batch_size, 1))
        q_values = test.test(
            ("get_q_values", [state_space.sample(batch_size), action_sample]))
        for q_val in q_values:
            self.assertTrue(q_val.dtype == np.float32)
            self.assertTrue(q_val.shape == (batch_size, 1))

        action_sample, _ = test.test(("action_from_preprocessed_state",
                                      [state_space.sample(batch_size), False]))
        self.assertTrue(action_sample.dtype == np.float32)
        self.assertTrue(action_sample.shape == (batch_size, 1))
Exemplo n.º 27
0
    def __init__(
            self,
            component,
            input_spaces=None,
            action_space=None,
            seed=10,
            logging_level=None,
            execution_spec=None,
            # TODO: Move all the below into execution_spec just like for Agent class.
            enable_profiler=False,
            disable_monitoring=False,
            device_strategy="default",
            device_map=None,
            backend=None,
            auto_build=True,
            build_kwargs=None):
        """
        Args:
            component (Component): The Component to be tested (may contain sub-components).
            input_spaces (Optional[dict]): Dict with component's API input-parameter' names as keys and Space objects
                or Space specs as values. Describes the input Spaces for the component.
                None, if the Component to be tested has no API methods with input parameters.
            action_space (Optional[Space]): The action space to pass into the GraphBuilder.
            seed (Optional[int]): The seed to use for random-seeding the Model object.
                If None, do not seed the Graph (things may behave non-deterministically).
            logging_level (Optional[int]): When provided, sets RLGraph's root_logger's logging level to this value.
            execution_spec (Optional[dict]): Specification dict for execution settings.
            enable_profiler (bool): When enabled, activates backend profiling. Default: False.
            disable_monitoring (bool): When True, will not use a monitored session. Default: False.
            device_strategy (str): Optional device-strategy to be passed into GraphExecutor.
            device_map (Optional[Dict[str,str]]): Optional device-map to be passed into GraphExecutor.
            backend (Optional[str]): Override global backend settings for a test by passing in a specific
                backend, convenience method.
            auto_build (Optional[bool]): If False, the graph is not built automatically; `build()` has to be
                triggered manually (e.g. to evaluate build statistics).
            build_kwargs (Optional[dict]): Dict to be passed as **kwargs to the call to `self.graph_executor.build`.
        """
        self.seed = seed
        np.random.seed(seed)
        random.seed(seed)

        if logging_level is not None:
            root_logger.setLevel(logging_level)

        # Create a GraphBuilder.
        self.graph_builder = GraphBuilder(action_space=action_space)
        self.component = component
        self.component.nesting_level = 0
        self.input_spaces = input_spaces
        self.build_kwargs = build_kwargs or dict()

        # Build the model.
        execution_spec = parse_execution_spec(
            execution_spec or dict(seed=self.seed,
                                   enable_profiler=enable_profiler,
                                   profiler_frequency=1,
                                   device_strategy=device_strategy,
                                   disable_monitoring=disable_monitoring,
                                   device_map=device_map))
        use_backend = backend if backend is not None else get_backend()
        self.graph_executor = GraphExecutor.from_spec(
            use_backend,
            graph_builder=self.graph_builder,
            execution_spec=execution_spec)
        if auto_build:
            self.build()
        else:
            print("Auto-build false, did not build. Waiting for manual build.")
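        # Typical usage (a minimal sketch based on other tests in this file; Dummy1To1's "run"
        # API-method simply adds 1.0 to its input):
        #
        #     test = ComponentTest(component=Dummy1To1(scope="dummy"), input_spaces=dict(input_=float))
        #     test.test(("run", 1.1), expected_outputs=2.1)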
Exemplo n.º 28
0
class TestDQNAgentShortTaskLearning(unittest.TestCase):
    """
    Tests whether the DQNAgent can learn in simple environments.
    """
    root_logger.setLevel(level=logging.INFO)
    grid_world_preprocessing_spec = [dict(type="reshape", flatten=True)]
    # Preprocessed state spaces.
    grid_world_2x2_flattened_state_space = FloatBox(shape=(4, ),
                                                    add_batch_rank=True)
    grid_world_4x4_flattened_state_space = FloatBox(shape=(16, ),
                                                    add_batch_rank=True)
    is_windows = os.name == "nt"

    def test_dqn_on_2x2_grid_world(self):
        """
        Creates a DQNAgent and runs it via a Runner on a simple 2x2 GridWorld.
        """
        dummy_env = GridWorld("2x2")
        agent = DQNAgent.from_spec(
            config_from_path("configs/dqn_agent_for_2x2_gridworld.json"),
            double_q=False,
            dueling_q=False,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
            observe_spec=dict(buffer_size=100),
            execution_spec=dict(seed=12),
            update_spec=dict(update_interval=4,
                             batch_size=24,
                             sync_interval=32),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            store_last_q_table=True)

        time_steps = 1000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld("2x2"),
            agent=agent,
            preprocessing_spec=self.grid_world_preprocessing_spec,
            worker_executes_preprocessing=True)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(
            np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -3.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], 350)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)

    def test_double_dqn_on_2x2_grid_world(self):
        """
        Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld.
        """
        dummy_env = GridWorld("2x2")
        agent = DQNAgent.from_spec(
            config_from_path("configs/dqn_agent_for_2x2_gridworld.json"),
            dueling_q=False,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
            observe_spec=dict(buffer_size=100),
            execution_spec=dict(seed=10),
            update_spec=dict(update_interval=4,
                             batch_size=24,
                             sync_interval=32),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            store_last_q_table=True)

        time_steps = 1000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld("2x2"),
            agent=agent,
            preprocessing_spec=self.grid_world_preprocessing_spec,
            worker_executes_preprocessing=True)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(
            np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], 350)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)

    def test_double_dqn_on_4x4_grid_world(self):
        """
        Creates a double DQNAgent and runs it via a Runner on a simple 4x4 GridWorld.
        """
        dummy_env = GridWorld("4x4")
        agent = DQNAgent.from_spec(
            config_from_path("configs/dqn_agent_for_4x4_gridworld.json"),
            dueling_q=False,
            state_space=self.grid_world_4x4_flattened_state_space,
            action_space=dummy_env.action_space,
            observe_spec=dict(buffer_size=100),
            execution_spec=dict(seed=10),
            update_spec=dict(update_interval=4,
                             batch_size=32,
                             sync_interval=32),
            optimizer_spec=dict(type="adam", learning_rate=0.005),
            store_last_q_table=True)

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld("4x4"),
            agent=agent,
            preprocessing_spec=self.grid_world_preprocessing_spec,
            worker_executes_preprocessing=True)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(
            np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -10)
        self.assertGreaterEqual(results["max_episode_reward"], -4)
        self.assertLessEqual(results["episodes_executed"], 1000)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): (-5, -4, -4, -4),  # 0
            (0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): (-5, -5, -3, -4),  # 1
            (0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): (-4, -2, -5, -3),  # 2
            # 3=terminal
            (0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): (-4, -3, -5, -5),  # 4
            # 5=terminal
            (0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0): (-5, -1, -1, -3),  # 6
            (0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0): (-2, 0, -1, -5),  # 7
            (0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0): (-3, -4, -2, -4),  # 8
            (0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0): (-3, -5, -1, -5),  # 9
            (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0): (-2, -5, 0, -2),  # 10
            (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0): (-1, 1, 0, -1),  # 11
            (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0): (-4, -4, -5, -3),  # 12
            # 13=terminal
            # 14=terminal
            # 15=terminal
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)

    def test_dqn_on_cart_pole(self):
        """
        Creates a DQNAgent and runs it via a Runner on the CartPole Env.
        """
        dummy_env = OpenAIGymEnv("CartPole-v0")
        agent = DQNAgent.from_spec(
            config_from_path("configs/dqn_agent_for_cartpole.json"),
            double_q=False,
            dueling_q=False,
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space,
            observe_spec=dict(buffer_size=200),
            execution_spec=dict(seed=15),
            update_spec=dict(update_interval=4,
                             batch_size=24,
                             sync_interval=64),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            store_last_q_table=True)

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: OpenAIGymEnv("CartPole-v0", seed=15),
            agent=agent,
            render=self.is_windows,
            worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        #print("STATES:\n{}".format(agent.last_q_table["states"]))
        #print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], 25)
        self.assertGreaterEqual(results["max_episode_reward"], 100.0)
        self.assertLessEqual(results["episodes_executed"], 200)

    def test_double_dueling_dqn_on_cart_pole(self):
        """
        Creates a double and dueling DQNAgent and runs it via a Runner on the CartPole Env.
        """
        dummy_env = OpenAIGymEnv("CartPole-v0")
        agent = DQNAgent.from_spec(
            config_from_path("configs/dqn_agent_for_cartpole.json"),
            double_q=True,
            dueling_q=True,
            state_space=dummy_env.state_space,
            action_space=dummy_env.action_space,
            observe_spec=dict(buffer_size=200),
            execution_spec=dict(seed=156),
            update_spec=dict(update_interval=4,
                             batch_size=64,
                             sync_interval=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            store_last_q_table=True)

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: OpenAIGymEnv("CartPole-v0", seed=10),
            agent=agent,
            render=self.is_windows,
            worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        #print("STATES:\n{}".format(agent.last_q_table["states"]))
        #print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], 15)
        self.assertGreaterEqual(results["max_episode_reward"], 160.0)
        self.assertLessEqual(results["episodes_executed"], 100)
Exemplo n.º 29
0
class TestSACShortTaskLearning(unittest.TestCase):
    """
    Tests whether the SACAgent and the SACAgentComponent can learn in simple environments.
    """
    root_logger.setLevel(level=logging.INFO)

    is_windows = os.name == "nt"

    def test_sac_agent_component_on_fake_env(self):
        config = config_from_path("configs/sac_component_for_fake_env_test.json")

        # Arbitrary state space, state should not be used in this example.
        state_space = FloatBox(shape=(2,))
        continuous_action_space = FloatBox(low=-1.0, high=1.0)
        terminal_space = BoolBox(add_batch_rank=True)
        policy = Policy.from_spec(config["policy"], action_space=continuous_action_space)
        policy.add_components(Synchronizable(), expose_apis="sync")
        q_function = ValueFunction.from_spec(config["value_function"])

        agent_component = SACAgentComponent(
            agent=None,
            policy=policy,
            q_function=q_function,
            preprocessor=PreprocessorStack.from_spec([]),
            memory=ReplayMemory.from_spec(config["memory"]),
            discount=config["discount"],
            initial_alpha=config["initial_alpha"],
            target_entropy=None,
            optimizer=AdamOptimizer.from_spec(config["optimizer"]),
            vf_optimizer=AdamOptimizer.from_spec(config["value_function_optimizer"], scope="vf-optimizer"),
            alpha_optimizer=None,
            q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
            num_q_functions=2
        )

        test = ComponentTest(
            component=agent_component,
            input_spaces=dict(
                states=state_space.with_batch_rank(),
                preprocessed_states=state_space.with_batch_rank(),
                actions=continuous_action_space.with_batch_rank(),
                rewards=FloatBox(add_batch_rank=True),
                next_states=state_space.with_batch_rank(),
                terminals=terminal_space,
                batch_size=int,
                preprocessed_s_prime=state_space.with_batch_rank(),
                importance_weights=FloatBox(add_batch_rank=True),
                preprocessed_next_states=state_space.with_batch_rank(),
                deterministic=bool,
                weights="variables:{}".format(policy.scope),
                # TODO: how to provide the space for multiple component variables?
                # q_weights=Dict(
                #    q_0="variables:{}".format(q_function.scope),
                #    q_1="variables:{}".format(agent_component._q_functions[1].scope),
                # )
            ),
            action_space=continuous_action_space,
            build_kwargs=dict(
                optimizer=agent_component._optimizer,
                build_options=dict(
                    vf_optimizer=agent_component.vf_optimizer,
                ),
            )
        )

        policy_loss = []
        vf_loss = []

        # This test simulates an env whose reward is the pdf of a Normal(loc=0.5, scale=0.2)
        # evaluated at the chosen action, regardless of any state inputs.
        # The component should therefore learn to produce actions close to the mode (0.5).
        true_mean = 0.5
        target_dist = stats.norm(loc=true_mean, scale=0.2)
        batch_size = 100
        for _ in range(5000):
            action_sample = continuous_action_space.sample(batch_size)
            rewards = target_dist.pdf(action_sample)
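            # E.g. an action of exactly 0.5 earns the maximum possible reward of
            # ~1.99 (= 1 / (0.2 * sqrt(2 * pi))), while an action of 0.0 earns only ~0.09.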
            result = test.test(("update_from_external_batch", [
                state_space.sample(batch_size),
                action_sample,
                rewards,
                [True] * batch_size,
                state_space.sample(batch_size),
                [1.0] * batch_size  # importance
            ]))
            policy_loss.append(result["actor_loss"])
            vf_loss.append(result["critic_loss"])

        self.assertTrue(np.mean(policy_loss[:100]) > np.mean(policy_loss[-100:]))
        self.assertTrue(np.mean(vf_loss[:100]) > np.mean(vf_loss[-100:]))

        action_sample = np.linspace(-1, 1, batch_size)
        q_values = test.test(("get_q_values", [state_space.sample(batch_size), action_sample]))
        for q_val in q_values:
            q_val = q_val.flatten()
            np.testing.assert_allclose(q_val, target_dist.pdf(action_sample), atol=0.2)

        action_sample, _ = test.test(("action_from_preprocessed_state", [state_space.sample(batch_size), False]))
        action_sample = action_sample.flatten()
        np.testing.assert_allclose(np.mean(action_sample), true_mean, atol=0.1)

    def test_sac_learning_on_gaussian_density_as_reward_env(self):
        """
        Creates an SAC-Agent and runs it via a Runner on the GaussianDensityAsRewardEnv.
        """
        env = GaussianDensityAsRewardEnv(episode_length=5)
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_gaussian_density_env.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent)
        worker.execute_episodes(num_episodes=500)
        rewards = worker.finished_episode_rewards[0]  # 0=1st env in vector-env
        self.assertTrue(np.mean(rewards[:100]) < np.mean(rewards[-100:]))

        worker.execute_episodes(num_episodes=100, use_exploration=False, update_spec=None)
        rewards = worker.finished_episode_rewards[0]
        self.assertTrue(len(rewards) == 100)
        evaluation_score = np.mean(rewards)
        self.assertTrue(.5 * env.get_max_reward() < evaluation_score <= env.get_max_reward())

    def test_sac_on_pendulum(self):
        """
        Creates an SAC-Agent and runs it on Pendulum.
        """
        env = OpenAIGymEnv("Pendulum-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_pendulum.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=self.is_windows
        )
        # Note: SAC is more computationally expensive.
        episodes = 50
        results = worker.execute_episodes(episodes)

        print(results)

        self.assertTrue(results["timesteps_executed"] == episodes * 200)
        self.assertTrue(results["episodes_executed"] == episodes)
        self.assertGreater(results["mean_episode_reward"], -800)

    def test_sac_on_cartpole(self):
        """
        Creates an SAC-Agent and runs it on CartPole.
        """
        env = OpenAIGymEnv("CartPole-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space
        )

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=False,
            render=self.is_windows
        )

        time_steps = 10000
        results = worker.execute_timesteps(time_steps)

        print(results)

    def test_sac_2x2_grid_world_with_container_actions(self):
        """
        Creates a SAC agent and runs it via a Runner on a simple 2x2 GridWorld using container actions.
        """
        # ftj = forward + turn + jump
        env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path("configs/sac_agent_for_2x2_gridworld_with_container_actions.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        agent = SACAgent.from_spec(
            agent_config,
            state_space=FloatBox(shape=(4,)),
            action_space=dummy_env.action_space,
        )

        time_steps = 10000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=False,
            render=False
        )
        results = worker.execute_timesteps(time_steps, use_exploration=True)
        print(results)

    def test_sac_cartpole_on_ray(self):
        """
        Tests SAC on CartPole via the Ray-based Ape-X executor.
        """
        # Import Ray here so other test cases do not need to import it if not installed.
        from rlgraph.execution.ray import ApexExecutor
        env_spec = dict(
            type="openai",
            gym_env="CartPole-v0"
        )
        agent_config = config_from_path("configs/sac_cartpole_on_apex.json")
        executor = ApexExecutor(
            environment_spec=env_spec,
            agent_config=agent_config,
        )
        # Define executor, test assembly.
        print("Successfully created executor.")

        # Executes actual workload.
        result = executor.execute_workload(workload=dict(num_timesteps=20000, report_interval=1000,
                                                         report_interval_min_seconds=1))
        print("Finished executing workload:")
        print(result)
Exemplo n.º 30
0
class TestDevicePlacements(unittest.TestCase):
    """
    Tests different ways to place Components and their ops/variables on different devices.
    """
    root_logger.setLevel(level=logging.INFO)

    def test_single_component(self):
        """
        Place the entire Component on its own device.
        """
        a = Dummy1To1(scope="A", device="/device:CPU:0")
        test = ComponentTest(component=a, input_spaces=dict(input_=float))
        # Actually check the device of the ops in a.
        self.assertEqual(
            a.api_methods["run"].in_op_columns[0].op_records[0].op.device,
            "/device:CPU:0")
        self.assertEqual(
            a.api_methods["run"].out_op_columns[0].op_records[0].op.device,
            "/device:CPU:0")
        # Expected: in + 1.0
        test.test(("run", 1.1), expected_outputs=2.1)

    def test_single_component_with_variables(self):
        """
        Place variables on CPU, ops on GPU (if exists).
        """
        var_device = "/device:CPU:0"
        op_device = "/device:GPU:0"
        a = DummyWithVar(scope="A",
                         device=dict(variables=var_device, ops=op_device))
        test = ComponentTest(component=a, input_spaces=dict(input_=float))

        # Vars -> CPU.
        self.assertEqual(a.variables["A/constant-variable"].device, var_device)
        # Placeholders -> GPU.
        self.assertEqual(
            a.api_methods["run_plus"].in_op_columns[0].op_records[0].op.device,
            op_device)
        self.assertEqual(
            a.api_methods["run_minus"].in_op_columns[0].op_records[0].op.
            device, op_device)
        # Actual ops -> GPU.
        self.assertEqual(
            a.api_methods["run_plus"].out_op_columns[0].op_records[0].op.
            device, op_device)
        self.assertEqual(
            a.api_methods["run_minus"].out_op_columns[0].op_records[0].op.
            device, op_device)

        # Expected: in + 2.0
        test.test(("run_plus", 1.1), expected_outputs=3.1)

    def test_sub_components_with_device_map(self):
        """
        Place variables on CPU, ops on GPU (if exists).
        """
        a = DummyWithSubComponents(scope="A")
        comp_device = "/device:GPU:0"
        sub_comp_device = "/device:CPU:0"
        test = ComponentTest(component=a,
                             input_spaces=dict(input_=float),
                             device_strategy="custom",
                             device_map=dict({
                                 "A/dummy-with-var": sub_comp_device,
                                 "A": comp_device
                             }))
        # Actually check the device of the variables and ops in a.
        actual_comp_device = "/device:GPU:0" if "/device:GPU:0" in test.graph_builder.available_devices else \
            "/device:CPU:0"
        self.assertEqual(
            a.api_methods["run1"].in_op_columns[0].op_records[0].op.device,
            actual_comp_device)
        self.assertEqual(
            a.api_methods["run1"].out_op_columns[0].op_records[0].op.device,
            sub_comp_device)
        self.assertEqual(
            a.api_methods["run1"].out_op_columns[0].op_records[1].op.device,
            actual_comp_device)
        self.assertEqual(
            a.api_methods["run2"].in_op_columns[0].op_records[0].op.device,
            actual_comp_device)
        self.assertEqual(
            a.api_methods["run2"].out_op_columns[0].op_records[0].op.device,
            actual_comp_device)

        test.test(("run1", 1.1), expected_outputs=[3.1, 4.1])
        test.test(("run2", -1.1), expected_outputs=-2.1)