Example #1
    def test_apex_weight_syncing(self):
        env = RandomEnv(state_space=spaces.IntBox(2),
                        action_space=spaces.IntBox(2),
                        deterministic=True)

        agent = Agent.from_spec(
            config_from_path("configs/apex_agent_for_random_env.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        policy_weights = agent.get_policy_weights()
        print('policy weights: {}'.format(policy_weights))

        for variable, weights in policy_weights.items():
            weights += 0.01
        agent.set_policy_weights(policy_weights)

        new_weights = agent.get_policy_weights()
        recursive_assert_almost_equal(policy_weights, new_weights)
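
The increment loop above works only because the weight values are NumPy arrays: `+=` mutates each array in place, so the same perturbed arrays are handed back via `set_policy_weights` and later compared by `recursive_assert_almost_equal`. A minimal standalone sketch of that mechanism (plain NumPy, hypothetical variable names, no RLgraph involved):

# Minimal sketch: in-place `+=` on ndarray dict values is visible through the dict.
import numpy as np

policy_weights = {
    "dense/kernel": np.zeros((2, 2)),  # hypothetical variable names
    "dense/bias": np.zeros((2,)),
}

for variable, weights in policy_weights.items():
    weights += 0.01  # in-place add -> mutates the array stored in the dict

assert all(np.allclose(w, 0.01) for w in policy_weights.values())
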
Example #2
    def __init__(self,
                 state_start=0.0,
                 reward_start=-100.0,
                 steps_to_terminal=10):
        """
        Args:
            state_start (float): State to start with after reset.
            reward_start (float): Reward to start with (after first action) after a reset.
            steps_to_terminal (int): Number of steps after which a terminal signal is raised.
        """
        super(DeterministicEnv, self).__init__(state_space=spaces.FloatBox(),
                                               action_space=spaces.IntBox(2))

        self.state_start = state_start
        self.reward_start = reward_start
        self.steps_to_terminal = steps_to_terminal

        self.state = state_start
        self.reward = reward_start
        self.steps_into_episode = 0
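
For context, here is a standalone sketch of the reset/step bookkeeping a deterministic dummy environment with this constructor typically performs. The +1.0 increments and the `(state, reward, terminal)` return signature are assumptions for illustration and are not taken from the original `DeterministicEnv`:

# Sketch only: assumed step/reset semantics for a deterministic dummy env.
class DeterministicEnvSketch(object):
    def __init__(self, state_start=0.0, reward_start=-100.0, steps_to_terminal=10):
        self.state_start = state_start
        self.reward_start = reward_start
        self.steps_to_terminal = steps_to_terminal
        self.state, self.reward, self.steps_into_episode = state_start, reward_start, 0

    def reset(self):
        self.state, self.reward = self.state_start, self.reward_start
        self.steps_into_episode = 0
        return self.state

    def step(self, action):
        self.state += 1.0    # assumed deterministic progression
        self.reward += 1.0   # assumed reward schedule
        self.steps_into_episode += 1
        terminal = self.steps_into_episode >= self.steps_to_terminal
        return self.state, self.reward, terminal

env = DeterministicEnvSketch()
state = env.reset()
for _ in range(10):
    state, reward, terminal = env.step(action=0)
assert terminal and env.steps_into_episode == 10
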
Example #3
    def __init__(self,
                 world="4x4",
                 save_mode=False,
                 reward_function="sparse",
                 state_representation="discr"):
        """
        Args:
            world (Union[str,List[str]]): Either a string to map into `MAPS` or a list of strings describing the rows
                of the world (e.g. ["S ", " G"] for a two-row/two-column world with start and goal state).

            save_mode (bool): Whether to replace holes (H) with fire fields (F). Default: False.

            reward_function (str): One of
                sparse: hole=-1, fire=-1, goal=50, all other steps=-1
                rich: hole=-100, fire=-10, goal=50

            state_representation (str): One of "discr", "xy", "cam"
        """
        # Build our map.
        if isinstance(world, str):
            self.description = world
            world = self.MAPS[world]
        else:
            self.description = "custom-map"

        world = np.array(list(map(list, world)))
        # Apply safety switch.
        world[world == 'H'] = ("H" if not save_mode else "F")

        # `world` is now a 2D numpy char array that needs to be indexed using y/x pairs (first row, then column).
        self.world = world
        self.n_row, self.n_col = self.world.shape
        (start_x, ), (start_y, ) = np.nonzero(self.world == "S")

        # Figure out our state space.
        assert state_representation in ["discr", "xy", "cam"]
        self.state_representation = state_representation
        # Discrete states (single int from 0 to n).
        if self.state_representation == "discr":
            state_space = spaces.IntBox(self.n_row * self.n_col)
        # x/y position (2 ints).
        elif self.state_representation == "xy":
            state_space = spaces.IntBox(low=(0, 0),
                                        high=(self.n_col, self.n_row),
                                        shape=(2, ))
        # Camera outputting a 2D color image of the world.
        else:
            state_space = spaces.IntBox(0,
                                        255,
                                        shape=(self.n_row, self.n_col, 3))

        self.default_start_pos = self.get_discrete_pos(start_x, start_y)
        self.discrete_pos = self.default_start_pos

        assert reward_function in ["sparse", "rich"]  # TODO: "potential"-based reward
        self.reward_function = reward_function

        # Store the goal position for proximity calculations (for "potential" reward function).
        (self.goal_x, ), (self.goal_y, ) = np.nonzero(self.world == "G")

        # Call the super's constructor.
        super(GridWorld, self).__init__(state_space=state_space,
                                        action_space=spaces.IntBox(4))

        # Reset ourselves.
        self.state = None
        self.camera_pixels = None  # only used if state_representation == 'cam'
        self.reward = None
        self.is_terminal = None
        self.reset(randomize=False)
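
The map parsing at the top of this constructor is plain NumPy and can be checked in isolation. Here is a quick standalone sketch of the same idiom on a tiny hand-written map (the 2x2 layout is made up for illustration):

# Sketch: row-strings -> 2-D char array, locate "S", apply the safety switch.
import numpy as np

rows = ["SH",
        " G"]  # tiny illustrative custom map
world = np.array(list(map(list, rows)))
print(world.shape)  # (2, 2) -> n_row, n_col

(start_row,), (start_col,) = np.nonzero(world == "S")
print(start_row, start_col)  # 0 0

save_mode = True
world[world == "H"] = ("H" if not save_mode else "F")
print(world[0, 1])  # "F" -> hole replaced by fire
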
Example #4
    def test_dqn_functionality(self):
        """
        Creates a DQNAgent and runs it for a few steps in a GridWorld to rigorously test
        all steps of the learning process.
        """
        env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
        agent = Agent.from_spec(  # type: DQNAgent
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            double_q=True,
            dueling_q=True,
            state_space=env.state_space,
            action_space=env.action_space,
            discount=0.95)
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld(world="2x2", save_mode=True),
            agent=agent)
        test = AgentTest(worker=worker)

        # Helper python DQNLossFunc object.
        loss_func = DQNLossFunction(backend="python",
                                    double_q=True,
                                    discount=agent.discount)
        loss_func.when_input_complete(
            input_spaces=dict(loss_per_item=[
                spaces.FloatBox(shape=(4,), add_batch_rank=True),
                spaces.IntBox(4, add_batch_rank=True),
                spaces.FloatBox(add_batch_rank=True),
                spaces.BoolBox(add_batch_rank=True),
                spaces.FloatBox(shape=(4,), add_batch_rank=True),
                spaces.FloatBox(shape=(4,), add_batch_rank=True)
            ]),
            action_space=env.action_space)

        matrix1_qnet = np.array([[0.9] * 2] * 4)
        matrix2_qnet = np.array([[0.8] * 5] * 2)
        matrix1_target_net = np.array([[0.9] * 2] * 4)
        matrix2_target_net = np.array([[0.8] * 5] * 2)

        a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

        # 1st step -> Expect insert into python-buffer.
        # action: up (0)
        test.step(1, reset=True)
        # Environment's new state.
        test.check_env("state", 0)
        # Agent's buffer.
        test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]],
                         key_or_index="env_0")  # <- prev state (preprocessed)
        test.check_agent("actions_buffer", [a], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        # Memory contents.
        test.check_var("replay-memory/index", 0)
        test.check_var("replay-memory/size", 0)
        test.check_var("replay-memory/memory/states",
                       np.array([[0] * 4] * agent.memory.capacity))
        test.check_var("replay-memory/memory/actions",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/rewards",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False] * agent.memory.capacity))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 2nd step -> expect insert into memory (and python buffer should be empty again).
        # action: up (0)
        # Also check the policy and target policy values (Should be equal at this point).
        test.step(1)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 2)
        test.check_var("replay-memory/size", 2)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] + [False] * (agent.memory.capacity - 2)))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
        # actions: down (2), up (0)  <- exploring is True = more random actions
        # Expect an update to the policy variables (leave target as is (no sync yet)).
        test.step(2, use_exploration=True)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 4)
        test.check_var("replay-memory/size", 4)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/actions",
            np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0] * 4 +  # + [-3.0] +
                     [0.0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] * 2 + [False] *
                     (agent.memory.capacity - 4)))
        # Get the latest memory batch.
        expected_batch = dict(states=np.array([[1.0, 0.0, 0.0, 0.0],
                                               [1.0, 0.0, 0.0, 0.0]]),
                              actions=np.array([0, 1]),
                              rewards=np.array([-1.0, -3.0]),
                              terminals=np.array([False, True]),
                              next_states=np.array([[1.0, 0.0, 0.0, 0.0],
                                                    [0.0, 0.0, 0.0, 0.0]]))
        test.check_agent("last_memory_batch", expected_batch)

        # Calculate the weight updates and check against actually update weights by the AgentDQN.
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet,
                                                 matrix1_target_net,
                                                 matrix2_target_net, agent,
                                                 loss_func)
        # Check policy and target-policy weights (policy should be updated now).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        matrix1_qnet = mat_updated[0]
        matrix2_qnet = mat_updated[1]

        # 5th step -> Another buffer update check.
        # action: down (2) (weights have been updated -> different actions)
        test.step(1)
        test.check_env("state", 3)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty because we reached the end of the episode (buffer gets force-flushed)
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 5)
        test.check_var("replay-memory/size", 5)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False, True] * 2 + [True, False]))
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        # 6th/7th step (with exploration enabled) -> Another buffer update check.
        # action: up, down (0, 2)
        test.step(2, use_exploration=True)
        test.check_env("state", 1)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty again; flushed after 6th step (when buffer was full).
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index",
                       1)  # index has been rolled over (memory capacity is 6)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=4)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_target_net)

        # 8th step -> Another buffer update check and weights update and sync.
        # action: down (2)
        test.step(1)
        test.check_env("state", 1)
        test.check_agent("states_buffer", [1], key_or_index="env_0")
        test.check_agent("actions_buffer", [2], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        expected_batch = dict(
            states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
            actions=np.array([0, 1]),
            rewards=np.array([-1.0, -3.0]),
            terminals=np.array([True, True]),
            next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
            # TODO: <- This is wrong and must be fixed
            # (next-state of first item is from a previous insert and unrelated to first item)
        )
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 1)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        # Assume that the sync happens first (matrices are already the same when updating).
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet, matrix1_qnet,
                                                 matrix2_qnet, agent,
                                                 loss_func)

        # Now target-net should be again 1 step behind policy-net.
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=2)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=2)  # again: old matrix
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            mat_updated[1],
            decimals=2)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=2)
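
The python-backend `DQNLossFunction` created at the top of this test computes a double-Q loss, and the helper `_helper_update_matrix` (not shown in this excerpt) uses it to derive the expected weight updates. For reference, a NumPy sketch of the textbook per-item double-DQN TD error follows; it is only the standard formula with illustrative names and shapes, not the exact RLgraph implementation:

# Sketch: r + gamma * Q_target(s', argmax_a Q_online(s', a)) - Q_online(s, a)
import numpy as np

def double_dqn_td_error(q_s, q_sp, q_sp_target, actions, rewards, terminals, discount=0.95):
    batch = np.arange(len(actions))
    a_prime = np.argmax(q_sp, axis=1)       # action selection via online net
    q_backup = q_sp_target[batch, a_prime]  # action evaluation via target net
    targets = rewards + discount * q_backup * (1.0 - terminals.astype(np.float64))
    return targets - q_s[batch, actions]

# Two items, four actions (mirroring the FloatBox(shape=(4,)) specs above).
q = np.full((2, 4), 0.8)
td = double_dqn_td_error(q, q, q,
                         actions=np.array([0, 1]),
                         rewards=np.array([-1.0, -3.0]),
                         terminals=np.array([False, True]))
print(td)  # [-1.04 -3.8 ]
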
Example #5
    def __init__(self, world="4x4", save_mode=False, action_type="udlr",
                 reward_function="sparse", state_representation="discrete"):
        """
        Args:
            world (Union[str,List[str]]): Either a string to map into `MAPS` or a list of strings describing the rows
                of the world (e.g. ["S ", " G"] for a two-row/two-column world with start and goal state).

            save_mode (bool): Whether to replace holes (H) with fire fields (F). Default: False.

            action_type (str): Which action space to use. Choose between "udlr" (up, down, left, right), which is a
                discrete action space, and "ftj" (forward + turn + jump), which is a container multi-discrete
                action space.

            reward_function (str): One of
                sparse: hole=-1, fire=-1, goal=50, all other steps=-1
                rich: hole=-100, fire=-10, goal=50

            state_representation (str):
                - "discrete": An int representing the field on the grid, 0 meaning the upper left field, 1 the one
                    below it, etc.
                - "xy": The x and y grid position tuple.
                - "xy+orientation": The x and y grid position tuple plus the actor's orientation (if any), encoded
                    as a tuple of 2 values.
                - "camera": A 3-channel image where each field in the grid-world is one pixel and the 3 channels are
                    used to indicate different items in the scene (walls, holes, the actor, etc.).
        """
        # Build our map.
        if isinstance(world, str):
            self.description = world
            world = self.MAPS[world]
        else:
            self.description = "custom-map"

        world = np.array(list(map(list, world)))
        # Apply safety switch.
        world[world == 'H'] = ("H" if not save_mode else "F")

        # `world` is now a 2D numpy char array that needs to be indexed using y/x pairs (first row, then column).
        self.world = world
        self.n_row, self.n_col = self.world.shape
        (start_x,), (start_y,) = np.nonzero(self.world == "S")

        # Figure out our state space.
        assert state_representation in ["discrete", "xy", "xy+orientation", "camera"]
        self.state_representation = state_representation
        # Discrete states (single int from 0 to n).
        if self.state_representation == "discrete":
            state_space = spaces.IntBox(self.n_row * self.n_col)
        # x/y position (2 ints).
        elif self.state_representation == "xy":
            state_space = spaces.IntBox(low=(0, 0), high=(self.n_col, self.n_row), shape=(2,))
        # x/y position + orientation (4 ints).
        elif self.state_representation == "xy+orientation":
            state_space = spaces.IntBox(low=(0, 0, 0, 0), high=(self.n_col, self.n_row, 1, 1))
        # Camera outputting a 2D color image of the world.
        else:
            state_space = spaces.IntBox(0, 255, shape=(self.n_row, self.n_col, 3))

        self.default_start_pos = self.get_discrete_pos(start_x, start_y)
        self.discrete_pos = self.default_start_pos

        assert reward_function in ["sparse", "rich"]  # TODO: "potential"-based reward
        self.reward_function = reward_function

        # Store the goal position for proximity calculations (for "potential" reward function).
        (self.goal_x,), (self.goal_y,) = np.nonzero(self.world == "G")

        # Specify the actual action spaces.
        self.action_type = action_type
        action_space = spaces.IntBox(4) if self.action_type == "udlr" else spaces.Dict(dict(
            forward=spaces.IntBox(3), turn=spaces.IntBox(3), jump=spaces.IntBox(2)
        ))

        # Call the super's constructor.
        super(GridWorld, self).__init__(state_space=state_space, action_space=action_space)

        # Reset ourselves.
        self.state = None
        self.orientation = None  # int: 0, 90, 180, 270
        self.camera_pixels = None  # only used if state_representation == 'camera'
        self.reward = None
        self.is_terminal = None
        self.reset(randomize=False)
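
When `action_type="ftj"` is selected above, actions are dict-valued: a container space with `forward`, `turn` and `jump` components. The following small illustration shows what such an action could look like; the random sampling and the semantics noted in the comments are assumptions for illustration only:

# Sketch: one container action for the "ftj" space (forward in {0,1,2}, turn in {0,1,2}, jump in {0,1}).
import random

def sample_ftj_action():
    return dict(
        forward=random.randrange(3),  # e.g. 0=stay, 1=forward, 2=backward (assumed semantics)
        turn=random.randrange(3),     # e.g. 0=no turn, 1=left, 2=right (assumed semantics)
        jump=random.randrange(2),     # 0=no jump, 1=jump (assumed semantics)
    )

action = sample_ftj_action()
print(sorted(action.keys()))  # ['forward', 'jump', 'turn']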