Python SingleThreadedWorker示例，rlgraph.execution.single_threaded_worker.SingleThreadedWorker Python示例

示例#1

0

显示文件

    def test_multi_gpu_ppo_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
        also runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        env_spec = dict(type="grid-world", world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/multi_gpu_ppo_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = PPOAgent.from_spec(
            agent_config,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
        )

        time_steps = 10000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=True,
                                      preprocessing_spec=preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        # Assume we have learned something.
        # TODO: This test needs more tuning. -1.0 is not great for the 2x2 grid world.
        self.assertGreater(results["mean_episode_reward"], -1.0)

示例#2

0

显示文件

    def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
        also runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        env_spec = dict(type="grid-world", world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/multi_gpu_dqn_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = DQNAgent.from_spec(
            agent_config,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
        )

        time_steps = 1000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=True,
                                      preprocessing_spec=preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        # Marge q-tables of all four GPUs:
        agent.last_q_table["q_values"] = agent.last_q_table[
            "q_values"].reshape((48, 4))

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(
            np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 2)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)

示例#3

0

显示文件

    def test_episodes(self):
        """
        Simply tests if episode execution loop works and returns a result.
        """
        agent = RandomAgent(action_space=self.environment.action_space,
                            state_space=self.environment.state_space)
        worker = SingleThreadedWorker(env_spec=lambda: self.environment,
                                      agent=agent,
                                      frameskip=1,
                                      worker_executes_preprocessing=False)

        result = worker.execute_episodes(5, max_timesteps_per_episode=10)
        # Max 5 * 10.
        self.assertLessEqual(result['timesteps_executed'], 50)
        self.assertEqual(result['episodes_executed'], 5)
        self.assertLessEqual(result['env_frames'], 50)
        self.assertGreaterEqual(result['runtime'], 0.0)

示例#4

0

显示文件

    def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system.

        THIS TEST REQUIRES A MULTI GPU SYSTEM.
        """
        #root_logger.setLevel(DEBUG)  # test
        env = GridWorld("2x2")
        agent = DQNAgent.from_spec(
            config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json"),
            dueling_q=False,
            state_space=env.state_space,
            action_space=env.action_space,
            observe_spec=dict(buffer_size=100),
            # Rule of thumb for multi-GPU (with n GPUs): n-fold batch-size and learning rate w/ respect to 1 GPU.
            update_spec=dict(update_interval=4, batch_size=48, sync_interval=32),
            optimizer_spec=dict(type="adam", learning_rate=0.15),
            store_last_q_table=True
        )

        time_steps = 400
        worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], 250)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)

示例#5

0

显示文件

文件： test_ppo_agent_functionality.py 项目： EmpereurCC/RLgraph_exp

    def test_ppo_on_container_state_and_action_spaces_and_very_large_rewards(
            self):
        """
        Tests stability of PPO on an extreme env producing strange container states and large rewards and requiring
        container actions.
        """
        env = RandomEnv(
            state_space=Dict(
                {"F_position": FloatBox(shape=(2, ), low=0.01, high=0.02)}),
            action_space=Dict({
                "F_direction_low-1.0_high1.0":
                FloatBox(shape=(), low=-1.0, high=1.0),
                "F_forward_direction_low-1.0_high1.0":
                FloatBox(shape=(), low=-1.0, high=1.0),
                "B_jump":
                BoolBox()
            }),
            reward_space=FloatBox(low=-1000.0,
                                  high=-100000.0),  # hugely negative rewards
            terminal_prob=0.0000001)

        agent_config = config_from_path(
            "configs/ppo_agent_for_random_env_with_container_spaces.json")
        agent = PPOAgent.from_spec(agent_config,
                                   state_space=env.state_space,
                                   action_space=env.action_space)

        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            preprocessing_spec=None,
            worker_executes_preprocessing=True,
            #episode_finish_callback=lambda episode_return, duration, timesteps, env_num:
            #print("episode return {}; steps={}".format(episode_return, timesteps))
        )
        results = worker.execute_timesteps(num_timesteps=int(1e6),
                                           use_exploration=True)

        print(results)

示例#6

0

显示文件

    def test_prioritized_replay_atari_throughput(self):
        """
        Tests throughput on standard Atari environments using the prioritized replay memory.
        """
        agent = DQNAgent(
            states_spec=self.env.state_space,
            action_spec=self.env.action_space,
            network_spec=self.network,
            memory_spec=dict(
                type='prioritized',
                capacity=100000,
                next_states=True
            )
        )
        worker = SingleThreadedWorker(
            env_spec=lambda: self.env,
            agent=agent,
            frameskip=1
        )

        result = worker.execute_timesteps(num_timesteps=1000000, use_exploration=True)
        print('Agent throughput = {} ops/s'.format(result['ops_per_second']))
        print('Environment throughput = {} frames/s'.format(result['env_frames_per_second']))

示例#7

0

显示文件

    def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
        also runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        env_spec = dict(type="grid-world", world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/multi_gpu_dqn_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = DQNAgent.from_spec(
            agent_config,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
        )

        time_steps = 2000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=True,
                                      preprocessing_spec=preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 2)

        # Check all learnt Q-values.
        q_values = agent.graph_executor.execute(
            ("get_q_values", one_hot(np.array([0, 1]), depth=4)))[:]
        recursive_assert_almost_equal(q_values[0], (0.8, -5, 0.9, 0.8),
                                      decimals=1)
        recursive_assert_almost_equal(q_values[1], (0.8, 1.0, 0.9, 0.9),
                                      decimals=1)

示例#8

0

显示文件

    def run_experiment(self, environment, experiment_num=0):
        environment = RLgraphEnvironmentWrapper(environment)
        environment.add_episode_end_callback(self.episode_finished,
                                             environment,
                                             runner_id=1)

        config = copy(self.config)

        max_episodes = config.pop('max_episodes', None)
        max_timesteps = config.pop('max_timesteps', None)
        max_episode_timesteps = config.pop('max_episode_timesteps')

        agent = Agent.from_spec(
            spec=config,
            state_space=environment.state_space,
            action_space=environment.action_space,
        )

        if experiment_num == 0 and self.load_model_file:
            logging.info("Loading model data from file: {}".format(
                self.load_model))
            agent.load_model(self.load_model_file)

        runner = SingleThreadedWorker(agent=agent, environment=environment)

        environment.reset()
        agent.reset_buffers()

        if max_timesteps:
            runner.execute_timesteps(
                num_timesteps=max_timesteps,
                max_timesteps_per_episode=max_episode_timesteps)
        else:
            runner.execute_episodes(
                num_episodes=max_episodes,
                max_timesteps_per_episode=max_episode_timesteps)

        return dict(initial_reset_time=0,
                    episode_rewards=runner.episode_rewards,
                    episode_timesteps=runner.episode_steps,
                    episode_end_times=runner.episode_durations)

示例#9

0

显示文件

    def test_dqn_functionality(self):
        """
        Creates a DQNAgent and runs it for a few steps in a GridWorld to vigorously test
        all steps of the learning process.
        """
        env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
        agent = Agent.from_spec(  # type: DQNAgent
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            double_q=True,
            dueling_q=True,
            state_space=env.state_space,
            action_space=env.action_space,
            discount=0.95)
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld(world="2x2", save_mode=True),
            agent=agent)
        test = AgentTest(worker=worker)

        # Helper python DQNLossFunc object.
        loss_func = DQNLossFunction(backend="python",
                                    double_q=True,
                                    discount=agent.discount)
        loss_func.when_input_complete(input_spaces=dict(loss_per_item=[
            spaces.FloatBox(shape=(4, ), add_batch_rank=True),
            spaces.IntBox(4, add_batch_rank=True),
            spaces.FloatBox(add_batch_rank=True),
            spaces.BoolBox(add_batch_rank=True),
            spaces.FloatBox(shape=(4, ), add_batch_rank=True),
            spaces.FloatBox(shape=(4, ), add_batch_rank=True)
        ]),
                                      action_space=env.action_space)

        matrix1_qnet = np.array([[0.9] * 2] * 4)
        matrix2_qnet = np.array([[0.8] * 5] * 2)
        matrix1_target_net = np.array([[0.9] * 2] * 4)
        matrix2_target_net = np.array([[0.8] * 5] * 2)

        a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

        # 1st step -> Expect insert into python-buffer.
        # action: up (0)
        test.step(1, reset=True)
        # Environment's new state.
        test.check_env("state", 0)
        # Agent's buffer.
        test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]],
                         key_or_index="env_0")  # <- prev state (preprocessed)
        test.check_agent("actions_buffer", [a], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        # Memory contents.
        test.check_var("replay-memory/index", 0)
        test.check_var("replay-memory/size", 0)
        test.check_var("replay-memory/memory/states",
                       np.array([[0] * 4] * agent.memory.capacity))
        test.check_var("replay-memory/memory/actions",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/rewards",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False] * agent.memory.capacity))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 2nd step -> expect insert into memory (and python buffer should be empty again).
        # action: up (0)
        # Also check the policy and target policy values (Should be equal at this point).
        test.step(1)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 2)
        test.check_var("replay-memory/size", 2)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] + [False] * (agent.memory.capacity - 2)))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
        # actions: down (2), up (0)  <- exploring is True = more random actions
        # Expect an update to the policy variables (leave target as is (no sync yet)).
        test.step(2, use_exploration=True)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 4)
        test.check_var("replay-memory/size", 4)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/actions",
            np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0] * 4 +  # + [-3.0] +
                     [0.0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] * 2 + [False] *
                     (agent.memory.capacity - 4)))
        # Get the latest memory batch.
        expected_batch = dict(states=np.array([[1.0, 0.0, 0.0, 0.0],
                                               [1.0, 0.0, 0.0, 0.0]]),
                              actions=np.array([0, 1]),
                              rewards=np.array([-1.0, -3.0]),
                              terminals=np.array([False, True]),
                              next_states=np.array([[1.0, 0.0, 0.0, 0.0],
                                                    [0.0, 0.0, 0.0, 0.0]]))
        test.check_agent("last_memory_batch", expected_batch)

        # Calculate the weight updates and check against actually update weights by the AgentDQN.
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet,
                                                 matrix1_target_net,
                                                 matrix2_target_net, agent,
                                                 loss_func)
        # Check policy and target-policy weights (policy should be updated now).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        matrix1_qnet = mat_updated[0]
        matrix2_qnet = mat_updated[1]

        # 5th step -> Another buffer update check.
        # action: down (2) (weights have been updated -> different actions)
        test.step(1)
        test.check_env("state", 3)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty b/c we reached end of episode (buffer gets force-flushed)
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 5)
        test.check_var("replay-memory/size", 5)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False, True] * 2 + [True, False]))
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        # 6th/7th step (with exploration enabled) -> Another buffer update check.
        # action: up, down (0, 2)
        test.step(2, use_exploration=True)
        test.check_env("state", 1)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty again; flushed after 6th step (when buffer was full).
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index",
                       1)  # index has been rolled over (memory capacity is 6)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=4)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_target_net)

        # 8th step -> Another buffer update check and weights update and sync.
        # action: down (2)
        test.step(1)
        test.check_env("state", 1)
        test.check_agent("states_buffer", [1], key_or_index="env_0")
        test.check_agent("actions_buffer", [2], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        expected_batch = dict(
            states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
            actions=np.array([0, 1]),
            rewards=np.array([-1.0, -3.0]),
            terminals=np.array([True, True]),
            next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
            # TODO: <- This is wrong and must be fixed
            # (next-state of first item is from a previous insert and unrelated to first item)
        )
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 1)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        # Assume that the sync happens first (matrices are already the same when updating).
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet, matrix1_qnet,
                                                 matrix2_qnet, agent,
                                                 loss_func)

        # Now target-net should be again 1 step behind policy-net.
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=2)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=2)  # again: old matrix
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            mat_updated[1],
            decimals=2)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=2)