Example #1
    def test_latest_batch(self):
        """
        Tests if we can fetch latest steps.
        """
        for backend in (None, "python"):
            ring_buffer = RingBuffer(capacity=self.capacity, backend=backend)
            test = ComponentTest(component=ring_buffer, input_spaces=self.input_spaces)

            # Insert 5 random elements.
            observation = non_terminal_records(self.record_space, 5)
            test.test(("insert_records", observation), expected_outputs=None)

            # First, test if the basic computation works.
            batch = test.test(("get_records", 5), expected_outputs=None)
            recursive_assert_almost_equal(batch, observation)

            # Next, insert capacity more elements:
            observation = non_terminal_records(self.record_space, self.capacity)
            test.test(("insert_records", observation), expected_outputs=None)

            # If we now fetch `capacity` elements, we expect to see exactly the last `capacity` inserted records.
            batch = test.test(("get_records", self.capacity), expected_outputs=None)
            recursive_assert_almost_equal(batch, observation)

            # If we fetch n elements, we expect to see exactly the last n.
            for last_n in range(1, 6):
                batch = test.test(("get_records", last_n), expected_outputs=None)
                recursive_assert_almost_equal(batch["actions"]["action1"], observation["actions"]["action1"][-last_n:])
                recursive_assert_almost_equal(batch["states"]["state2"], observation["states"]["state2"][-last_n:])
                recursive_assert_almost_equal(batch["terminals"], observation["terminals"][-last_n:])
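The "latest n" semantics exercised above (fetching the n most recently inserted records) can be illustrated with a tiny, hypothetical stand-in buffer; this is a sketch for intuition only, not the RLgraph `RingBuffer`:

# Hypothetical stand-in illustrating the "latest n" semantics under test.
class TinyRingBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []

    def insert(self, records):
        # Keep only the newest `capacity` records.
        self.data = (self.data + list(records))[-self.capacity:]

    def get_records(self, n):
        # The last n inserted records, newest last.
        return self.data[-n:]

buf = TinyRingBuffer(capacity=10)
buf.insert(range(15))              # 0..14; only 5..14 survive
assert buf.get_records(3) == [12, 13, 14]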
Example #2
 def assert_equal(outs, expected_outputs, decimals=7):
     """
     Convenience wrapper: See implementation of `recursive_assert_almost_equal` for details.
     """
     recursive_assert_almost_equal(outs,
                                   expected_outputs,
                                   decimals=decimals)
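The docstring above points at `recursive_assert_almost_equal` for details. As a rough mental model (a minimal sketch under assumed behavior, not the library's actual implementation), such a check recurses through containers and compares numeric leaves up to `decimals` digits:

# Rough mental model only -- assumes the check walks dicts/lists/tuples and
# compares numeric leaves with an almost-equal test.
import numpy as np

def recursive_almost_equal_sketch(actual, expected, decimals=7):
    if isinstance(expected, dict):
        assert set(actual.keys()) == set(expected.keys())
        for key in expected:
            recursive_almost_equal_sketch(actual[key], expected[key], decimals)
    elif isinstance(expected, (list, tuple)):
        assert len(actual) == len(expected)
        for a, e in zip(actual, expected):
            recursive_almost_equal_sketch(a, e, decimals)
    else:
        np.testing.assert_almost_equal(np.asarray(actual), np.asarray(expected), decimal=decimals)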
Example #3
    def test_value_function_weights(self):
        """
        Tests changing of value function weights.
        """
        env = OpenAIGymEnv("Pong-v0")
        agent_config = config_from_path("configs/ppo_agent_for_pong.json")
        agent = PPOAgent.from_spec(agent_config,
                                   state_space=env.state_space,
                                   action_space=env.action_space)
        weights = agent.get_weights()
        assert "value_function_weights" in weights
        assert "policy_weights" in weights

        policy_weights = weights["policy_weights"]
        value_function_weights = weights["value_function_weights"]

        # Just change vf weights.
        for key, weight in value_function_weights.items():
            value_function_weights[key] = weight + 0.01
        agent.set_weights(policy_weights, value_function_weights)
        new_actual_weights = agent.get_weights()

        recursive_assert_almost_equal(
            new_actual_weights["value_function_weights"],
            value_function_weights)
Example #4
    def test_demos_with_container_actions(self):
        # Tests if dqfd can fit a set of states to a set of actions.
        vocab_size = 100
        embed_dim = 128
        # ID/state space.
        state_space = IntBox(vocab_size, shape=(10, ))
        # Container action space.
        actions_space = {}
        num_outputs = 3
        for i in range(3):
            actions_space['action_{}'.format(i)] = IntBox(low=0,
                                                          high=num_outputs)
        actions_space = Dict(actions_space)

        agent_config = config_from_path("configs/dqfd_container.json")
        agent_config["network_spec"] = [
            dict(type="embedding", embed_dim=embed_dim, vocab_size=vocab_size),
            dict(type="reshape", flatten=True),
            dict(type="dense",
                 units=embed_dim,
                 activation="relu",
                 scope="dense_1")
        ]
        agent = DQFDAgent.from_spec(agent_config,
                                    state_space=state_space,
                                    action_space=actions_space)
        terminals = BoolBox(add_batch_rank=True)
        rewards = FloatBox(add_batch_rank=True)

        # Create a set of demos.
        demo_states = agent.preprocessed_state_space.with_batch_rank().sample(
            20)
        demo_actions = actions_space.with_batch_rank().sample(20)
        demo_rewards = rewards.sample(20, fill_value=1.0)
        demo_next_states = agent.preprocessed_state_space.with_batch_rank(
        ).sample(20)
        demo_terminals = terminals.sample(20, fill_value=False)

        # Insert.
        agent.observe_demos(
            preprocessed_states=demo_states,
            actions=demo_actions,
            rewards=demo_rewards,
            next_states=demo_next_states,
            terminals=demo_terminals,
        )

        # Fit demos.
        agent.update_from_demos(num_updates=5000, batch_size=20)

        # Evaluate demos:
        agent_actions = agent.get_action(demo_states,
                                         apply_preprocessing=False,
                                         use_exploration=False)
        recursive_assert_almost_equal(agent_actions, demo_actions)
Example #5
    def test_actor_component_with_lstm_network(self):
        # state space and internal state space
        state_space = FloatBox(shape=(2,), add_batch_rank=True, add_time_rank=True, time_major=False)
        internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)), add_batch_rank=True)
        time_percentages_space = FloatBox()
        # action_space.
        action_space = IntBox(2, add_batch_rank=True, add_time_rank=True)

        preprocessor = PreprocessorStack.from_spec(
            [dict(type="convert_type", to_dtype="float"), dict(type="divide", divisor=10)]
        )
        policy = Policy(network_spec=config_from_path("configs/test_lstm_nn.json"), action_space=action_space)
        exploration = Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1)
        ))
        actor_component = ActorComponent(preprocessor, policy, exploration)
        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(
                states=state_space,
                other_nn_inputs=Tuple(internal_states_space, add_batch_rank=True),
                time_percentage=time_percentages_space
            ),
            action_space=action_space
        )
        # Some state inputs (batch size=2, seq-len=1000; batch-major).
        np.random.seed(10)
        states = state_space.sample(size=(1000, 2))
        initial_internal_states = internal_states_space.zeros(size=2)  # only batch
        time_percentages = time_percentages_space.sample(1000)

        # Run a single time step n times to simulate acting and env interaction with an LSTM.
        preprocessed_states = np.ndarray(shape=(1000, 2, 2), dtype=float)  # `np.float` is removed in recent NumPy.
        actions = np.ndarray(shape=(1000, 2, 1), dtype=int)
        for i, time_percentage in enumerate(time_percentages):
            ret = test.test((
                "get_preprocessed_state_and_action",
                # Expand the time rank at axis 1, since the data is batch-major (time_major=False).
                [np.expand_dims(states[i], 1), tuple([initial_internal_states]), time_percentage]
            ))
            preprocessed_states[i] = ret["preprocessed_state"][:, 0, :]  # Strip the time rank again.
            actions[i] = ret["action"]
            # Check c/h-state shape.
            self.assertEqual(ret["nn_outputs"][1][0].shape, (2, 3))  # batch-size=2, LSTM units=3
            self.assertEqual(ret["nn_outputs"][1][1].shape, (2, 3))

        # Check all preprocessed states (easy: just divided by 10).
        expected_preprocessed_state = states / 10
        recursive_assert_almost_equal(preprocessed_states, expected_preprocessed_state)

        # Check the exploration functionality over the actions.
        # Not checking the mean, as we are mostly in the non-exploratory region; that's also why the stddev should be small.
        stddev_actions = actions.std()
        self.assertGreater(stddev_actions, 0.4)
        self.assertLess(stddev_actions, 0.6)
Example #6
    def check_env(self, prop, expected_value, decimals=7):
        """
        Checks a property of our environment for (almost) equality.

        Args:
            prop (str): The name of the Environment's property to check.
            expected_value (any): The expected value of the given property.
            decimals (Optional[int]): The number of digits after the floating point up to which to compare actual
                and expected values.
        """
        is_value = getattr(self.env, prop, None)
        recursive_assert_almost_equal(is_value, expected_value, decimals=decimals)
Example #7
    def test_sequential_vector_env(self):
        num_envs = 4
        env = SequentialVectorEnv(num_environments=num_envs,
                                  env_spec={
                                      "type": "gridworld",
                                      "world": "2x2"
                                  })

        # Simple test runs with fixed actions.
        # X=player's position
        s = env.reset(index=0)  # ["XH", " G"]  X=player's position
        self.assertTrue(s == 0)

        s = env.reset_all()
        # Assert each sub-environment explicitly: wrapping the assertions in `all(...)`
        # over a generator would short-circuit after the first (falsy) None return.
        for s_ in s:
            self.assertTrue(s_ == 0)

        s, r, t, _ = env.step([2 for _ in range(num_envs)])  # down: [" H", "XG"]
        for s_, r_, t_ in zip(s, r, t):
            self.assertTrue(s_ == 1)
            recursive_assert_almost_equal(r_, -0.1)
            self.assertTrue(not t_)

        s, r, t, _ = env.step([1 for _ in range(num_envs)])  # right: [" H", " X"]
        for s_, r_, t_ in zip(s, r, t):
            self.assertTrue(s_ == 3)
            recursive_assert_almost_equal(r_, 1.0)
            self.assertTrue(t_)

        # Reset all sub-environments. ["XH", " G"]  X=player's position
        for i in range(num_envs):
            env.reset(index=i)
        s, r, t, _ = env.step([1 for _ in range(num_envs)])  # right: [" X", " G"] -> in the hole
        for s_, r_, t_ in zip(s, r, t):
            self.assertTrue(s_ == 2)
            self.assertTrue(r_ == -5.0)
            self.assertTrue(t_)

        # Run against a wall.
        env.reset_all()  # ["XH", " G"]  X=player's position
        s, r, t, _ = env.step([3 for _ in range(num_envs)])  # left: ["XH", " G"]
        for s_, r_, t_ in zip(s, r, t):
            self.assertTrue(s_ == 0)
            recursive_assert_almost_equal(r_, -0.1)
            self.assertTrue(not t_)
        s, r, t, _ = env.step([2 for _ in range(num_envs)])  # down: [" H", "XG"]
        for s_, r_, t_ in zip(s, r, t):
            self.assertTrue(s_ == 1)
            recursive_assert_almost_equal(r_, -0.1)
            self.assertTrue(not t_)
        s, r, t, _ = env.step([0 for _ in range(num_envs)])  # up: ["XH", " G"]
        for s_, r_, t_ in zip(s, r, t):
            self.assertTrue(s_ == 0)
            recursive_assert_almost_equal(r_, -0.1)
            self.assertTrue(not t_)
Example #8
    def test_double_dqn_on_2x2_grid_world(self):
        """
        Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld.
        """
        env_spec = dict(world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/dqn_agent_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = DQNAgent.from_spec(
            agent_config,
            dueling_q=False,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
            execution_spec=dict(seed=10),
            update_spec=dict(update_interval=4,
                             batch_size=24,
                             sync_interval=32),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            store_last_q_table=True)

        time_steps = 1000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(
            np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], 350)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)
Example #9
    def test_2x2_grid_world_using_flow_methods(self):
        """
        Tests a minimalistic 2x2 GridWorld.
        """
        env = GridWorld(world="2x2")

        # Simple test runs with fixed actions.
        # X=player's position
        s, r, t = env.step_flow(2)  # down: [" H", "XG"]
        self.assertTrue(s == 1)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t = env.step_flow(1)  # right: [" H", " X"]
        self.assertTrue(s == 0)
        self.assertTrue(r == 1.0)
        self.assertTrue(t)

        s, r, t = env.step_flow(1)  # right: [" X", " G"] -> in the hole
        self.assertTrue(s == 0)
        self.assertTrue(r == -5.0)
        self.assertTrue(t)

        # Run against a wall.
        s, r, t = env.step_flow(3)  # left: ["XH", " G"]
        self.assertTrue(s == 0)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t = env.step_flow(2)  # down: [" H", "XG"]
        self.assertTrue(s == 1)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t = env.step_flow(0)  # up: ["XH", " G"]
        self.assertTrue(s == 0)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
Example #10
    def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
        also runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        env_spec = dict(type="grid-world", world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/multi_gpu_dqn_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = DQNAgent.from_spec(
            agent_config,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
        )

        time_steps = 1000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=True,
                                      preprocessing_spec=preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        # Merge q-tables of all four GPUs:
        agent.last_q_table["q_values"] = agent.last_q_table[
            "q_values"].reshape((48, 4))

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(
            np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 2)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)
Example #11
    def check_agent(self, prop, expected_value, decimals=7, key_or_index=None):
        """
        Checks a property of our Agent for (almost) equality.

        Args:
            prop (str): The name of the Agent's property to check.
            expected_value (any): The expected value of the given property.
            decimals (Optional[int]): The number of digits after the floating point up to which to compare actual
                and expected values.
            key_or_index (Optional[int, str]): Optional key or index into the property in case of a nested data structure.
        """
        is_value = getattr(self.agent, prop, None)
        if key_or_index is not None:
            is_value = is_value[key_or_index]
        recursive_assert_almost_equal(is_value, expected_value, decimals=decimals)
Example #12
    def check_var(self, variable, expected_value, decimals=7):
        """
        Checks the value of one of our Agent's variables for (almost) equality against an expected one.

        Args:
            variable (str): The global scope (within Agent's root-component) of the variable to check.
            expected_value (any): The expected value of the given variable.
            decimals (Optional[int]): The number of digits after the floating point up to which to compare actual
                and expected values.
        """
        variables_dict = self.agent.root_component.variables
        assert variable in variables_dict, "ERROR: Variable '{}' not found in Agent '{}'!".\
            format(variable, self.agent.name)
        var = variables_dict[variable]
        value = self.graph_executor.read_variable_values(var)
        recursive_assert_almost_equal(value, expected_value, decimals=decimals)
Example #13
    def test_learning_2x2_grid_world(self):
        """
        Tests if Ape-X can learn a simple environment using a single worker, thus replicating
        DQN.
        """
        env_spec = dict(type="grid-world", world="2x2", save_mode=False)
        agent_config = config_from_path(
            "configs/apex_agent_for_2x2_gridworld.json")
        # TODO remove after unified backends
        if get_backend() == "pytorch":
            agent_config["memory_spec"]["type"] = "mem_prioritized_replay"
        executor = ApexExecutor(
            environment_spec=env_spec,
            agent_config=agent_config,
        )
        # Define executor, test assembly.
        print("Successfully created executor.")

        # Executes actual workload.
        result = executor.execute_workload(
            workload=dict(num_timesteps=5000,
                          report_interval=100,
                          report_interval_min_seconds=1))
        full_worker_stats = executor.result_by_worker()
        print("All finished episode rewards")
        print(full_worker_stats["episode_rewards"])

        print("STATES:\n{}".format(
            executor.local_agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(
            np.round_(executor.local_agent.last_q_table["q_values"],
                      decimals=2)))

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(
                executor.local_agent.last_q_table["states"],
                executor.local_agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)
Example #14
    def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system.

        THIS TEST REQUIRES A MULTI GPU SYSTEM.
        """
        #root_logger.setLevel(DEBUG)  # test
        env = GridWorld("2x2")
        agent = DQNAgent.from_spec(
            config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json"),
            dueling_q=False,
            state_space=env.state_space,
            action_space=env.action_space,
            observe_spec=dict(buffer_size=100),
            # Rule of thumb for multi-GPU (with n GPUs): n-fold batch-size and learning rate w/ respect to 1 GPU.
            update_spec=dict(update_interval=4, batch_size=48, sync_interval=32),
            optimizer_spec=dict(type="adam", learning_rate=0.15),
            store_last_q_table=True
        )

        time_steps = 400
        worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], 250)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
Example #15
    def test_impala_on_2x2_grid_world(self):
        """
        Creates a single IMPALAAgent and runs it via the IMPALAWorker on a simple 2x2 GridWorld.
        """
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=12),
            update_spec=dict(update_interval=4, batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
        )

        learn_updates = 1000
        # Setup the queue runner.
        agent.call_api_method("setup_queue_runner")
        for _ in range(learn_updates):
            agent.update()

        #print("STATES:\n{}".format(agent.last_q_table["states"]))
        #print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

        #self.assertEqual(results["timesteps_executed"], time_steps)
        #self.assertEqual(results["env_frames"], time_steps)
        #self.assertGreaterEqual(results["mean_episode_reward"], -3.5)
        #self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        #self.assertLessEqual(results["episodes_executed"], 350)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)
Example #16
    def test_weights_getting_setting(self):
        """
        Tests getting and setting of the Agent's weights.
        """
        env = GridWorld(world="2x2")
        agent = Agent.from_spec(
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        weights = agent.get_weights()
        new_weights = {}
        for key, weight in weights["policy_weights"].items():
            new_weights[key] = weight + 0.01

        agent.set_weights(new_weights)
        new_actual_weights = agent.get_weights()

        recursive_assert_almost_equal(new_actual_weights["policy_weights"],
                                      new_weights)
Example #17
    def test_policy_sync(self):
        """
        Tests weight syncing of policy (and only policy, not Q-functions).
        """
        env = OpenAIGymEnv("CartPole-v0")
        agent = SACAgent.from_spec(
            config_from_path("configs/sac_agent_for_cartpole.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        weights = agent.get_weights()
        print("weights =", weights.keys())

        new_weights = {}
        for key, value in weights["policy_weights"].items():
            new_weights[key] = value + 0.01

        agent.set_weights(policy_weights=new_weights,
                          value_function_weights=None)

        updated_weights = agent.get_weights()["policy_weights"]
        recursive_assert_almost_equal(updated_weights, new_weights)
Example #18
    def test_simple_variational_auto_encoder(self):
        # Space must contain batch dimension (otherwise, NNlayer will complain).
        input_spaces = dict(
            input_=FloatBox(shape=(3,), add_batch_rank=True), z_vector=FloatBox(shape=(1,), add_batch_rank=True)
        )

        variational_auto_encoder = VariationalAutoEncoder(
            z_units=1,
            encoder_network_spec=config_from_path("configs/test_vae_encoder_network.json"),
            decoder_network_spec=config_from_path("configs/test_vae_decoder_network.json")
        )

        # Do not seed, we calculate expectations manually.
        test = ComponentTest(component=variational_auto_encoder, input_spaces=input_spaces)

        # Batch of size=3.
        input_ = np.array([[0.1, 0.2, 0.3], [1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
        global_scope = "variational-auto-encoder/"
        # Calculate output manually.
        var_dict = test.read_variable_values(variational_auto_encoder.variable_registry)

        encoder_network_out = dense_layer(
            input_, var_dict[global_scope+"encoder-network/encoder-layer/dense/kernel"],
            var_dict[global_scope+"encoder-network/encoder-layer/dense/bias"]
        )
        expected_mean = dense_layer(
            encoder_network_out, var_dict[global_scope+"mean-layer/dense/kernel"],
            var_dict[global_scope+"mean-layer/dense/bias"]
        )
        expected_stddev = dense_layer(
            encoder_network_out, var_dict[global_scope + "stddev-layer/dense/kernel"],
            var_dict[global_scope + "stddev-layer/dense/bias"]
        )
        out = test.test(("encode", input_), expected_outputs=None)
        recursive_assert_almost_equal(out["mean"], expected_mean, decimals=5)
        recursive_assert_almost_equal(out["stddev"], np.exp(expected_stddev), decimals=5)
        self.assertTrue(out["z_sample"].shape == (3, 1))

        test.terminate()
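The manual expectation above is built from a `dense_layer` test helper. Assuming it is a plain affine transform (a hedged sketch, not the actual helper), the math being checked is just a matmul plus bias, with the stddev head exponentiated afterwards as asserted above:

import numpy as np

def dense_layer_sketch(x, kernel, bias):
    # Assumed behavior of the `dense_layer` helper: y = x @ W + b (no activation).
    return np.matmul(x, kernel) + bias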
Example #19
    def test_capacity_with_episodes(self):
        """
        Tests if inserts of non-terminals work.

        Note that this does not test episode semantics itself, which are tested below.
        """
        ring_buffer = RingBuffer(capacity=self.capacity)
        test = ComponentTest(component=ring_buffer,
                             input_spaces=self.input_spaces)
        # Internal memory variables.
        ring_buffer_variables = test.get_variable_values(
            ring_buffer, self.ring_buffer_variables)
        size_value = ring_buffer_variables["size"]
        index_value = ring_buffer_variables["index"]
        num_episodes_value = ring_buffer_variables["num-episodes"]
        episode_index_values = ring_buffer_variables["episode-indices"]

        # Assert indices 0 before insert.
        self.assertEqual(size_value, 0)
        self.assertEqual(index_value, 0)
        self.assertEqual(num_episodes_value, 0)
        self.assertEqual(np.sum(episode_index_values), 0)

        # Insert one more element than capacity. Note: this differs from the
        # replay test because, due to episode semantics, it matters whether
        # the inserted records are terminal or not. This tests if episode-index
        # updating causes problems when none of the inserted elements are terminal.
        observation = non_terminal_records(self.record_space,
                                           self.capacity + 1)
        test.test(("insert_records", observation), expected_outputs=None)

        ring_buffer_variables = test.get_variable_values(
            ring_buffer, self.ring_buffer_variables)
        size_value = ring_buffer_variables["size"]
        index_value = ring_buffer_variables["index"]
        num_episodes_value = ring_buffer_variables["num-episodes"]
        episode_index_values = ring_buffer_variables["episode-indices"]

        # Size should be equivalent to capacity when full.
        self.assertEqual(size_value, self.capacity)

        # Index should have wrapped around to 1, i.e. (capacity + 1) % capacity.
        self.assertEqual(index_value, 1)
        self.assertEqual(num_episodes_value, 0)
        self.assertEqual(np.sum(episode_index_values), 0)

        # If we fetch n elements, we expect to see exactly the last n.
        for last_n in range(1, 6):
            batch = test.test(("get_records", last_n), expected_outputs=None)
            recursive_assert_almost_equal(
                batch["actions"]["action1"],
                observation["actions"]["action1"][-last_n:])
            recursive_assert_almost_equal(
                batch["states"]["state2"],
                observation["states"]["state2"][-last_n:])
            recursive_assert_almost_equal(batch["terminals"],
                                          observation["terminals"][-last_n:])
Example #20
    def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
        also runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        env_spec = dict(type="grid-world", world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/multi_gpu_dqn_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = DQNAgent.from_spec(
            agent_config,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
        )

        time_steps = 2000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=True,
                                      preprocessing_spec=preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 2)

        # Check all learnt Q-values.
        q_values = agent.graph_executor.execute(
            ("get_q_values", one_hot(np.array([0, 1]), depth=4)))[:]
        recursive_assert_almost_equal(q_values[0], (0.8, -5, 0.9, 0.8),
                                      decimals=1)
        recursive_assert_almost_equal(q_values[1], (0.8, 1.0, 0.9, 0.9),
                                      decimals=1)
Example #21
    def test_random_env(self):
        """
        Tests deterministic functionality of RandomEnv.
        """
        env = RandomEnv(state_space=FloatBox(shape=(2, 2)), action_space=IntBox(2), deterministic=True)

        # Simple test runs with fixed actions.
        s = env.reset()
        recursive_assert_almost_equal(s, np.array([[0.77132064, 0.02075195], [0.63364823, 0.74880388]]))
        s, r, t, _ = env.step(env.action_space.sample())
        recursive_assert_almost_equal(s, np.array([[0.1980629, 0.7605307], [0.1691108, 0.0883398]]))
        s, r, t, _ = env.step(env.action_space.sample())
        recursive_assert_almost_equal(r, np.array(0.7217553))
        s, r, t, _ = env.step(env.action_space.sample())
        self.assertEqual(t, False)
        s, r, t, _ = env.step(env.action_space.sample())
        recursive_assert_almost_equal(s, np.array([[0.4418332, 0.434014], [0.617767, 0.5131382]]))
        s, r, t, _ = env.step(env.action_space.sample())
Example #22
    def test_impala_on_2x2_grid_world(self):
        """
        Creates a single IMPALAAgent and runs it via a simple loop on a 2x2 GridWorld.
        """
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=12),
            update_spec=dict(batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05))

        learn_updates = 50
        for i in range(learn_updates):
            ret = agent.update()
            mean_return = self._calc_mean_return(ret)
            print("i={} Loss={:.4} Avg-reward={:.2}".format(
                i, float(ret[1]), mean_return))

        # Assume we have learned something.
        self.assertGreater(mean_return, -0.1)

        # Check the last action probs for the 2 valid next_states (start (after a reset) and one below start).
        action_probs = ret[3]["action_probs"].reshape((80, 4))
        next_states = ret[3]["states"][:, 1:].reshape((80, ))
        for s_, probs in zip(next_states, action_probs):
            # Start state:
            # - Assume we picked "right" in state=1 (in order to step into goal state).
            # - OR we picked "up" or "left" in state=0 (unlikely, but possible).
            if s_ == 0:
                recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
                self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
                recursive_assert_almost_equal(probs[3], 0.0, decimals=2)
            # One below start:
            # - Assume we picked "down" in start state with very large probability.
            # - OR we picked "left" or "down" in state=1 (unlikely, but possible).
            elif s_ == 1:
                recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
                self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
                recursive_assert_almost_equal(probs[3], 0.0, decimals=2)

        agent.terminate()
Example #23
    def test_policy_for_bounded_continuous_action_space(self):
        """
        https://github.com/rlgraph/rlgraph/issues/43
        """
        nn_input_space = FloatBox(shape=(4, ), add_batch_rank=True)
        action_space = FloatBox(low=-1.0,
                                high=1.0,
                                shape=(1, ),
                                add_batch_rank=True)
        # Double the shape for alpha/beta params.
        # action_space_parameters = Tuple(FloatBox(shape=(1,)), FloatBox(shape=(1,)), add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=nn_input_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)

        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs.
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output.
        expected_nn_output = np.matmul(
            nn_input,
            ComponentTest.read_params("policy/test-network/hidden-layer",
                                      policy_params))
        test.test(("get_nn_outputs", nn_input),
                  expected_outputs=expected_nn_output)

        # Raw action layer output.
        expected_raw_logits = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "policy/action-adapter-0/action-network/action-layer",
                policy_params))
        test.test(("get_adapter_outputs", nn_input),
                  expected_outputs=dict(adapter_outputs=expected_raw_logits,
                                        nn_outputs=expected_nn_output),
                  decimals=5)

        # Parameter (alpha/betas).
        expected_alpha_parameters = np.log(
            np.exp(expected_raw_logits[:, 0:1]) + 1.0) + 1.0
        expected_beta_parameters = np.log(
            np.exp(expected_raw_logits[:, 1:]) + 1.0) + 1.0
        expected_parameters = tuple(
            [expected_alpha_parameters, expected_beta_parameters])
        test.test(("get_adapter_outputs_and_parameters", nn_input,
                   ["adapter_outputs", "parameters"]),
                  expected_outputs=dict(adapter_outputs=expected_raw_logits,
                                        parameters=expected_parameters),
                  decimals=5)

        print("Params: {}".format(expected_parameters))

        action = test.test(("get_action", nn_input))["action"]
        self.assertTrue(action.dtype == np.float32)
        self.assertGreaterEqual(action.min(), -1.0)
        self.assertLessEqual(action.max(), 1.0)
        self.assertTrue(action.shape == (3, 1))

        out = test.test(("get_action_and_log_likelihood", nn_input))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-probs.
        actions_scaled_back = (action + 1.0) / 2.0
        expected_action_log_llh_output = np.log(
            beta.pdf(actions_scaled_back, expected_alpha_parameters,
                     expected_beta_parameters))
        # expected_action_log_prob_output = np.array([[expected_action_log_prob_output[0][0]],
        # [expected_action_log_prob_output[1][1]], [expected_action_log_prob_output[2][2]]])
        test.test(("get_log_likelihood", [nn_input, action], "log_likelihood"),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        actions = test.test(("get_stochastic_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -1.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Deterministic sample.
        actions = test.test(("get_deterministic_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -1.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Distribution's entropy.
        entropy = test.test(("get_entropy", nn_input))["entropy"]
        self.assertTrue(entropy.dtype == np.float32)
        self.assertTrue(entropy.shape == (3, 1))
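For context on the bounded-continuous case above: the raw adapter outputs are mapped to Beta-distribution parameters via softplus(x) + 1, and actions sampled in [-1, 1] are rescaled to the Beta support [0, 1] before the log-likelihood is taken. A standalone sketch with made-up numbers (not part of the test):

import numpy as np
from scipy.stats import beta as beta_dist

raw_logits = np.array([[0.3, -0.2]])                       # hypothetical adapter output (alpha-, beta-logit)
alpha = np.log(np.exp(raw_logits[:, 0:1]) + 1.0) + 1.0     # softplus(x) + 1
beta_params = np.log(np.exp(raw_logits[:, 1:]) + 1.0) + 1.0

action = np.array([[0.5]])                                 # an action in [-1.0, 1.0]
action_01 = (action + 1.0) / 2.0                           # rescale to the Beta support [0, 1]
log_likelihood = np.log(beta_dist.pdf(action_01, alpha, beta_params))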
Example #24
    def test_policy_for_discrete_action_space_with_dueling_layer(self):
        # np.random.seed(10)
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        nn_input_space = FloatBox(shape=(3, ), add_batch_rank=True)

        # action_space (2 possible actions).
        action_space = IntBox(2, add_batch_rank=True)
        # flat_float_action_space = FloatBox(shape=(2,), add_batch_rank=True)

        # Policy with dueling logic.
        policy = DuelingPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_adapter_spec=dict(pre_network_spec=[
                dict(type="dense",
                     units=10,
                     activation="lrelu",
                     activation_params=[0.1])
            ]),
            units_state_value_stream=10,
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=nn_input_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs.
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output.
        expected_nn_output = relu(
            np.matmul(
                nn_input,
                ComponentTest.read_params(
                    "dueling-policy/test-network/hidden-layer",
                    policy_params)), 0.1)
        test.test(("get_nn_outputs", nn_input),
                  expected_outputs=expected_nn_output)

        # Single state values.
        expected_state_values = np.matmul(
            relu(
                np.matmul(
                    expected_nn_output,
                    ComponentTest.read_params(
                        "dueling-policy/dense-layer-state-value-stream",
                        policy_params))),
            ComponentTest.read_params("dueling-policy/state-value-node",
                                      policy_params))
        test.test(
            ("get_state_values", nn_input, ["state_values", "nn_outputs"]),
            expected_outputs=dict(state_values=expected_state_values,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # Raw action layer output.
        expected_raw_advantages = np.matmul(
            relu(
                np.matmul(
                    expected_nn_output,
                    ComponentTest.read_params(
                        "dueling-policy/action-adapter-0/action-network/dense-layer",
                        policy_params)), 0.1),
            ComponentTest.read_params(
                "dueling-policy/action-adapter-0/action-network/action-layer",
                policy_params))

        # Q-values: One for each item in the batch.
        expected_q_values_output = expected_state_values + expected_raw_advantages - \
            np.mean(expected_raw_advantages, axis=-1, keepdims=True)
        test.test(
            ("get_adapter_outputs", nn_input,
             ["adapter_outputs", "advantages"]),
            expected_outputs=dict(adapter_outputs=expected_q_values_output,
                                  advantages=expected_raw_advantages),
            decimals=5)

        # Parameter (probabilities). Softmaxed q_values.
        expected_parameters_output = np.maximum(
            softmax(expected_q_values_output, axis=-1), SMALL_NUMBER)
        test.test(
            ("get_adapter_outputs_and_parameters", nn_input,
             ["adapter_outputs", "parameters"]),
            expected_outputs=dict(adapter_outputs=expected_q_values_output,
                                  parameters=expected_parameters_output),
            decimals=5)

        print("Probs: {}".format(expected_parameters_output))

        expected_actions = np.argmax(expected_q_values_output, axis=-1)
        test.test(("get_action", nn_input, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", nn_input))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-probs.
        expected_action_log_llh_output = np.log(
            np.array([
                expected_parameters_output[0][action[0]],
                expected_parameters_output[1][action[1]],
                expected_parameters_output[2][action[2]],
            ]))
        test.test(("get_log_likelihood", [nn_input, action]),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output,
                      adapter_outputs=expected_q_values_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", nn_input),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (3, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", nn_input),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (3, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", nn_input), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (3, ))
Example #25
    def test_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (5 possible actions).
        action_space = IntBox(5, add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=state_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs (4 input nodes, batch size=2).
        states = np.array([[-0.08, 0.4, -0.05, -0.55],
                           [13.0, -14.0, 10.0, -16.0]])
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            ComponentTest.read_params("policy/test-network/hidden-layer",
                                      policy_params))

        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action layer output; Expected shape=(2,5): 2=batch, 5=action categories
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "policy/action-adapter-0/action-network/action-layer",
                policy_params))
        test.test(
            ("get_adapter_outputs", states),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
        expected_parameters_output = np.maximum(
            softmax(expected_action_layer_output, axis=-1), SMALL_NUMBER)
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters", "log_probs"]),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_output,
                      parameters=np.array(expected_parameters_output,
                                          dtype=np.float32),
                      log_probs=np.log(expected_parameters_output)),
                  decimals=5)

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        # Get action AND log-llh.
        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-probs.
        expected_action_log_llh_output = np.log(
            np.array([
                expected_parameters_output[0][action[0]],
                expected_parameters_output[1][action[1]]
            ]))
        test.test(("get_log_likelihood", [states, action], "log_likelihood"),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (2, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (2, ))
Example #26
    def test_shared_value_function_policy_for_discrete_action_space_with_time_rank_folding(
            self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(3, ),
                               add_batch_rank=True,
                               add_time_rank=True)

        # action_space (4 possible actions).
        action_space = IntBox(4, add_batch_rank=True, add_time_rank=True)
        flat_float_action_space = FloatBox(shape=(4, ),
                                           add_batch_rank=True,
                                           add_time_rank=True)

        # Policy with baseline action adapter AND batch-apply over the entire policy (NN + ActionAdapter + distr.).
        network_spec = config_from_path("configs/test_lrelu_nn.json")
        # Add folding and unfolding to network.
        network_spec["fold_time_rank"] = True
        network_spec["unfold_time_rank"] = True
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=network_spec,
            action_adapter_spec=dict(fold_time_rank=True,
                                     unfold_time_rank=True),
            action_space=action_space,
            value_fold_time_rank=True,
            value_unfold_time_rank=True)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(
                nn_inputs=state_space,
                actions=action_space,
            ),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variable_registry)

        # Some NN inputs.
        states = state_space.sample(size=(2, 3))
        states_folded = np.reshape(states, newshape=(6, 3))
        # Raw NN-output (3 hidden nodes). All weights=1.5, no biases.
        expected_nn_output = np.reshape(relu(
            np.matmul(
                states_folded,
                ComponentTest.read_params(
                    "shared-value-function-policy/test-network/hidden-layer",
                    policy_params)), 0.1),
                                        newshape=states.shape)
        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action layer output; folded shape=(6, 4), then unfolded to (2, 3, 4): 2=batch, 3=time, 4=action categories.
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/action-adapter-0/action-network/action-layer/",
                policy_params))

        expected_action_layer_output = np.reshape(expected_action_layer_output,
                                                  newshape=(2, 3, 4))
        test.test(
            ("get_adapter_outputs", states),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # State-values: One for each item in the batch.
        expected_state_value_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/value-function-node/dense-layer",
                policy_params))
        expected_state_value_output_unfolded = np.reshape(
            expected_state_value_output, newshape=(2, 3, 1))
        test.test(("get_state_values", states, ["state_values"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded),
                  decimals=5)

        expected_action_layer_output_unfolded = np.reshape(
            expected_action_layer_output, newshape=(2, 3, 4))
        test.test(("get_state_values_adapter_outputs_and_parameters", states,
                   ["state_values", "adapter_outputs"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded,
                      adapter_outputs=expected_action_layer_output_unfolded),
                  decimals=5)

        # Parameter (probabilities). Softmaxed logits.
        expected_parameters_output = np.maximum(
            softmax(expected_action_layer_output_unfolded, axis=-1),
            SMALL_NUMBER)
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters", "nn_outputs"]),
                  expected_outputs=dict(
                      nn_outputs=expected_nn_output,
                      adapter_outputs=expected_action_layer_output_unfolded,
                      parameters=expected_parameters_output),
                  decimals=5)

        print("Probs: {}".format(expected_parameters_output))

        expected_actions = np.argmax(expected_action_layer_output_unfolded,
                                     axis=-1)
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-llh.
        expected_action_log_llh_output = np.log(
            np.array([[
                expected_parameters_output[0][0][action[0][0]],
                expected_parameters_output[0][1][action[0][1]],
                expected_parameters_output[0][2][action[0][2]],
            ],
                      [
                          expected_parameters_output[1][0][action[1][0]],
                          expected_parameters_output[1][1][action[1][1]],
                          expected_parameters_output[1][2][action[1][2]],
                      ]]))
        test.test(("get_log_likelihood", [states, action]),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output,
                      adapter_outputs=expected_action_layer_output_unfolded),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)
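
        # A vectorized equivalent of the hand-built gather above (a sketch; assumes
        # NumPy >= 1.15 for np.take_along_axis): pick, per (batch, time) position, the
        # probability of the chosen action, then take the log.
        gathered_probs = np.take_along_axis(
            expected_parameters_output, np.expand_dims(action, axis=-1), axis=-1)
        recursive_assert_almost_equal(
            np.log(np.squeeze(gathered_probs, axis=-1)),
            expected_action_log_llh_output,
            decimals=5)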

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(
            out["action"].shape == (2, 3))  # Make sure output is unfolded.

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(
            out["action"].shape == (2, 3))  # Make sure output is unfolded.

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(
            out["entropy"].shape == (2, 3))  # Make sure output is unfolded.
Example #27
    def test_shared_value_function_policy_for_discrete_action_space(self):
        # state_space (the NN is a simple, single fc-layer relu network with 2 units, random biases and random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (3 possible actions).
        action_space = IntBox(3, add_batch_rank=True)

        # Policy with baseline action adapter.
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_space=action_space)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(
                nn_inputs=state_space,
                actions=action_space,
            ),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variable_registry)

        # Some NN inputs (4 input nodes, batch size=3).
        states = state_space.sample(size=3)
        # Raw NN-output (3 hidden nodes). All weights=1.5, no biases.
        expected_nn_output = relu(
            np.matmul(
                states,
                ComponentTest.read_params(
                    "shared-value-function-policy/test-network/hidden-layer",
                    policy_params)), 0.1)

        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)
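
        # Sketch: `relu(x, 0.1)` is assumed to be a leaky ReLU with negative slope 0.1;
        # under that assumption the same output can be reproduced with plain NumPy:
        pre_activation = np.matmul(
            states,
            ComponentTest.read_params(
                "shared-value-function-policy/test-network/hidden-layer", policy_params))
        recursive_assert_almost_equal(
            np.where(pre_activation > 0.0, pre_activation, 0.1 * pre_activation),
            expected_nn_output,
            decimals=5)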

        # Raw action layer output; expected shape=(3, 3): 3=batch, 3=action categories.
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/action-adapter-0/action-network/action-layer/",
                policy_params))
        test.test(
            ("get_adapter_outputs", states),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # State-values: One for each item in the batch.
        expected_state_value_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/value-function-node/dense-layer",
                policy_params))
        test.test(
            ("get_state_values", states, ["state_values"]),
            expected_outputs=dict(state_values=expected_state_value_output),
            decimals=5)

        # State-values and raw adapter outputs (logits).
        test.test(("get_state_values_adapter_outputs_and_parameters", states,
                   ["state_values", "adapter_outputs"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output,
                      adapter_outputs=expected_action_layer_output),
                  decimals=5)

        # Parameters (probabilities): softmaxed logits, clipped from below by SMALL_NUMBER.
        expected_parameters_output = np.maximum(
            softmax(expected_action_layer_output, axis=-1), SMALL_NUMBER)
        test.test(
            ("get_adapter_outputs_and_parameters", states,
             ["adapter_outputs", "parameters"]),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  parameters=expected_parameters_output),
            decimals=5)

        print("Probs: {}".format(expected_parameters_output))

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        # Get action AND log-llh.
        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-llh.
        expected_action_log_llh_output = np.log(
            np.array([
                expected_parameters_output[0][action[0]],
                expected_parameters_output[1][action[1]],
                expected_parameters_output[2][action[2]],
            ]))
        test.test(("get_log_likelihood", [states, action], "log_likelihood"),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output, llh)
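
        # Equivalent gather via integer-array indexing (sketch, plain NumPy only):
        # row i picks the probability of the action chosen for batch item i.
        recursive_assert_almost_equal(
            np.log(expected_parameters_output[np.arange(len(action)), action]),
            expected_action_log_llh_output,
            decimals=5)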

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (3, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (3, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (3, ))
Example #28
    def test_episode_fetching(self):
        """
        Test if we can accurately fetch most recent episodes.
        """
        for backend in (None, "python"):
            ring_buffer = RingBuffer(capacity=self.capacity, backend=backend)
            test = ComponentTest(component=ring_buffer, input_spaces=self.input_spaces)

            # Insert 2 non-terminals, 1 terminal
            observation = non_terminal_records(self.record_space, 2)
            test.test(("insert_records", observation), expected_outputs=None)
            observation = terminal_records(self.record_space, 1)
            test.test(("insert_records", observation), expected_outputs=None)

            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            num_episodes_value = ring_buffer_variables["num-episodes"]
            episode_index_values = ring_buffer_variables["episode-indices"]

            # One episode.
            self.assertEqual(num_episodes_value, 1)
            expected_indices = [0] * self.capacity
            expected_indices[0] = 2
            recursive_assert_almost_equal(episode_index_values, expected_indices)

            # We should now be able to retrieve one episode of length 3.
            episode = test.test(("get_episodes", 1), expected_outputs=None)
            expected_terminals = [0, 0, 1]
            recursive_assert_almost_equal(episode["terminals"], expected_terminals)

            # Requesting two episodes should still return just the single episode available.
            episode = test.test(("get_episodes", 2), expected_outputs=None)
            expected_terminals = [0, 0, 1]
            recursive_assert_almost_equal(episode["terminals"], expected_terminals)
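
            # Sketch (an assumption about the semantics, not the RingBuffer internals):
            # the most recent episode can be read straight off the terminals inserted so far.
            terminals_so_far = [0, 0, 1]
            last_end = max(i for i, t in enumerate(terminals_so_far) if t)  # -> 2
            recursive_assert_almost_equal(terminals_so_far[:last_end + 1], expected_terminals)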

            # Insert 7 non-terminals.
            observation = non_terminal_records(self.record_space, 7)
            test.test(("insert_records", observation), expected_outputs=None)

            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            index_value = ring_buffer_variables["index"]
            episode_index_values = ring_buffer_variables["episode-indices"]

            # Episode indices should not have changed.
            expected_indices[0] = 2
            recursive_assert_almost_equal(episode_index_values, expected_indices)
            # Inserted 2 non-terminal, 1 terminal, 7 non-terminal at capacity 10 -> should be at 0 again.
            self.assertEqual(index_value, 0)

            # Now inserting one terminal so the terminal buffer has layout [1 0 1 0 0 0 0 0 0 0]
            observation = terminal_records(self.record_space, 1)
            test.test(("insert_records", observation), expected_outputs=None)

            # Episode indices:
            ring_buffer_variables = test.get_variable_values(self.ring_buffer_variables)
            num_episodes_value = ring_buffer_variables["num-episodes"]
            recursive_assert_almost_equal(num_episodes_value, 2)

            # Check if we can fetch 2 episodes:
            episodes = test.test(("get_episodes", 2), expected_outputs=None)

            # We now expect to have retrieved:
            # - 10 time steps
            # - 2 terminal values of 1
            # - the two terminals one slot apart (indices 0 and 2) due to the insertion order
            self.assertEqual(len(episodes['terminals']), self.capacity)
            self.assertEqual(episodes['terminals'][0], True)
            self.assertEqual(episodes['terminals'][2], True)
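
            # Why slots 0 and 2 hold the terminals (write-order recap, not an extra check):
            # 2 non-terminals (slots 0-1), 1 terminal (slot 2), 7 non-terminals (slots 3-9),
            # then 1 terminal that wraps around to slot 0 -> terminals [1, 0, 1, 0, ..., 0].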
Example #29
    def test_4x4_grid_world_with_container_actions(self):
        """
        Tests a 4x4 GridWorld using forward+turn+jump container actions.
        """
        env = GridWorld(world="4x4", action_type="ftj", state_representation="xy+orientation")

        # Simple test runs with fixed actions.

        # Fall into hole.
        s = env.reset()  # [0, 0, 0] (x, y, orientation)
        recursive_assert_almost_equal(s, [0, 0, 0, 1])
        s, r, t, _ = env.step(dict(turn=2, forward=2))  # turn=2 (right), move=2 (forward), jump=0
        recursive_assert_almost_equal(s, [1, 0, 1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=2, forward=1))  # turn=2 (right), move=1 (stay), jump=0
        recursive_assert_almost_equal(s, [1, 0, 0, -1])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=1, forward=2))  # turn=1 (no turn), move=2 (forward), jump=0
        recursive_assert_almost_equal(s, [1, 1, 0, -1])
        self.assertTrue(r == -5.0)
        self.assertTrue(t)
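
        # Sketch (inferred from the asserted states above, not read from the GridWorld
        # source): "xy+orientation" appears to encode states as [x, y, sin(th), cos(th)]
        # with th in {0, 90, 180, 270} degrees; th is 0 at reset and turn=2 adds 90 degrees.
        import math  # stdlib only, used for this illustrative check
        def xy_orientation(x, y, th_degrees):
            rad = math.radians(th_degrees)
            return [x, y, int(round(math.sin(rad))), int(round(math.cos(rad)))]
        recursive_assert_almost_equal(xy_orientation(1, 0, 90), [1, 0, 1, 0])  # cf. the first step above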

        # Jump quite a lot and reach goal.
        env.reset()  # [0, 0, 0] (x, y, orientation)
        s, r, t, _ = env.step(dict(turn=2, forward=1))
        recursive_assert_almost_equal(s, [0, 0, 1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=1, forward=1, jump=1))
        recursive_assert_almost_equal(s, [2, 0, 1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=2, forward=2))
        recursive_assert_almost_equal(s, [2, 1, 0, -1])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=1, forward=2, jump=1))
        recursive_assert_almost_equal(s, [2, 3, 0, -1])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=2, forward=0))
        recursive_assert_almost_equal(s, [3, 3, -1, 0])
        self.assertTrue(r == 1.0)
        self.assertTrue(t)
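
        # (Reward structure read off the asserts above: -0.1 per regular step,
        #  -5.0 for falling into a hole (terminal), +1.0 for reaching the goal (terminal).)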

        # Run against a wall.
        env.reset()  # [0, 0, 0] (x, y, orientation)
        s, r, t, _ = env.step(dict(turn=1, forward=0))
        recursive_assert_almost_equal(s, [0, 1, 0, 1])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=0, forward=2))
        recursive_assert_almost_equal(s, [0, 1, -1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)

        # Jump over a hole (no reset).
        s, r, t, _ = env.step(dict(turn=2, forward=1))  # turn around
        s, r, t, _ = env.step(dict(turn=2, forward=1))
        recursive_assert_almost_equal(s, [0, 1, 1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=1, forward=1, jump=1))
        recursive_assert_almost_equal(s, [2, 1, 1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
Example #30
    def test_long_chain_grid_world(self):
        """
        Tests a minimalistic long-chain GridWorld.
        """
        env = GridWorld(world="long-chain")

        # Simple test runs with fixed actions.
        # X=player's position
        s = env.reset()  # ["X                                              G"]
        self.assertTrue(s == 33)
        s, r, t, _ = env.step(2)  # down: ["X                                              G"]
        self.assertTrue(s == 33)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(1)  # right: ["SX                                             G"]
        self.assertTrue(s == 34)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)

        env.reset()  # ["X                                              G"]
        # Right, left, down, up, right -> net movement of one step to the right per iteration.
        for x in range(20):
            s, r, t, _ = env.step(1)
            self.assertTrue(s == x + 33 + 1)
            recursive_assert_almost_equal(r, -0.1)
            self.assertTrue(not t)
            s, r, t, _ = env.step(3)
            self.assertTrue(s == x + 33)
            recursive_assert_almost_equal(r, -0.1)
            self.assertTrue(not t)
            s, r, t, _ = env.step(2)
            self.assertTrue(s == x + 33)
            recursive_assert_almost_equal(r, -0.1)
            self.assertTrue(not t)
            s, r, t, _ = env.step(0)
            self.assertTrue(s == x + 33)
            recursive_assert_almost_equal(r, -0.1)
            self.assertTrue(not t)
            s, r, t, _ = env.step(1)
            self.assertTrue(s == x + 33 + 1)
            recursive_assert_almost_equal(r, -0.1)
            self.assertTrue(not t)
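
        # Sketch of the index arithmetic asserted in the loop above: right, left, down, up,
        # right nets exactly one step to the right per pass (up and down leave the flat state
        # index unchanged on this world), so after pass x the player sits at index 33 + x + 1.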