Example #1
import time

import numpy as np

# `Env` and `DQN` are assumed to come from the surrounding project modules.


class DQNSolution:
    def __init__(self, gamma=0.9, lr=0.001):
        # hyperparameters
        self.lr = lr
        self.gamma = gamma

        # create environment
        self.env = Env((8, 8), (160, 90), default_rewards=0)
        # self.layout()  # item placement is left to an explicit layout() call

        # create DQN: 3 state features (row, column, picked-items encoding)
        self.dqn = DQN(3,
                       self.env.action_space.n_actions, [64, 32],
                       lr,
                       gamma,
                       experience_limit=1000)

    def layout(self):
        # place five pickable stars and a terminal exit cell on the map
        self.env.add_item('yellow_star', (3, 3),
                          credit=100,
                          pickable=True,
                          label=1)
        self.env.add_item('yellow_star', (0, 7),
                          credit=100,
                          pickable=True,
                          label=2)
        self.env.add_item('yellow_star', (7, 0),
                          credit=100,
                          pickable=True,
                          label=3)
        self.env.add_item('yellow_star', (6, 2),
                          credit=100,
                          pickable=True,
                          label=4)
        self.env.add_item('yellow_star', (7, 5),
                          credit=100,
                          pickable=True,
                          label=5)
        self.env.add_item('red_ball', (5, 6), terminal=True, label="Exit")

    def _centralize_range(self, start, stop=0):
        # turn the inclusive range [start, stop] into one centered at 0
        if stop == 0:
            stop = start
            start = 0
        assert stop >= start
        length = stop - start + 1  # avoid shadowing the built-in `len`
        middle = (length - 1) / 2
        return -middle, middle

    def _regularize_range(self, start, stop=0):
        # scale the centered range into evenly spaced values in [-1, 1]
        begin, end = self._centralize_range(start, stop)
        half = (end - begin) / 2
        return [i / half for i in np.arange(begin, end + 1)]

    def _regularize_location(self, loc):
        # map a (row, col) grid location to its [-1, 1] coordinates
        r, c = loc
        height, width = (self.env.map.n_rows, self.env.map.n_columns)
        rows = self._regularize_range(height - 1)
        cols = self._regularize_range(width - 1)
        return rows[r], cols[c]

    def _compose_state(self, loc, encode):
        agent_row, agent_col = self._regularize_location(loc)
        return [agent_row, agent_col, encode]

    def state(self):
        # encode the collected items as a bitmask: bit (label - 1) is set
        # for every item currently in the agent's bag
        encode = 0
        for item in self.env.agent.bag_of_objects:
            assert item.label > 0  # `label` must start from 1
            encode += pow(2, item.label - 1)

        return self._compose_state(self.env.agent.at, encode)

    def show_agent_values(self):
        self._show_action_values_from_loc_state(self.env.agent.at,
                                                self.state())

    def _show_action_values_from_loc_state(self, loc, state):
        # the DQN expects a batch, so wrap the single state into shape (1, n)
        state = np.array(state)
        matrix_form = state.reshape((1, *state.shape))
        action_values = self.dqn.action_values(matrix_form)[0]

        text_dict = {}
        for action_id, value in enumerate(action_values):
            action = self.env.action_space.action_from_id(action_id)
            value = np.round(value, 2)
            text_dict[action] = str(value)

        self.env.draw_values(loc, text_dict)

    def show_all_action_values(self, items_encode):
        # draw the action-values for every cell, for a fixed picked-items encoding
        self.env.show = True
        rows, cols = (self.env.map.n_rows, self.env.map.n_columns)
        locations = [(row, col) for row in range(rows) for col in range(cols)]
        for loc in locations:
            state = self._compose_state(loc, items_encode)
            self._show_action_values_from_loc_state(loc, state)

    def _run_an_episode(self, episode, sub_ep, truncate, show, delay):
        env = self.env  # just for convenience
        env.reset()
        env.show = show

        # start each sub-episode with a random subset of stars already picked up,
        # so that different item encodings get explored
        initial_stars = self.env.let_agent_random_pickup()
        total_stars = len(env.pickable_items())

        # get initial state and location
        state = self.state()
        location = env.agent.at
        next_location = location

        this_episode = []
        hit_walls = 0
        total_rewards = 0
        total_losses = 0
        end = False
        while not end:
            # show agent's action-values
            if show:
                self.show_agent_values()

            action_id = self.dqn.next_action(state, episode)
            action = env.action_space.action_from_id(action_id)
            reward, next_location, end, _ = env.step(action)
            if show:
                time.sleep(delay)
            if end and len(env.pickable_items()) == 0:
                reward = 100  # bonus for reaching the exit with every star collected

            total_rewards += reward

            next_state = self.state()
            one_step = (state, action_id, reward, next_state, end)
            this_episode.append(one_step)

            if location == next_location:
                hit_walls += 1

            if truncate is not None and env.steps >= truncate:
                break

            # debug info
            #print("state:", state)
            #print("common state:", common_state(state))
            #print("step {}: {} ----> {} {} reward {}\n".format(env.steps, location, next_location, action, reward))
            #print(next_state)
            state = next_state
            location = next_location

        # store the whole episode in the replay buffer, then train the network
        # on a mini-batch sampled from past experience
        self.dqn.fill_experience(this_episode)
        memory = self.dqn.experience.sample(20)
        loss = self.dqn.train_multi_episodes(memory)
        total_losses += loss

        final_stars = len(env.agent.bag_of_objects)
        bingo = ""
        if end:
            if initial_stars == 0 and final_stars == total_stars:
                bingo = ", bingo!"

        #print("items remain in map:", len(env.pickable_items()))
        print(
            "# {}-{}: loss is {:.4f}, steps/hit walls: {}/{}, stars: {} --> {}, total reward: {}{}"
            .format(episode, sub_ep, loss, env.steps, hit_walls, initial_stars,
                    final_stars, total_rewards, bingo))
        #env.show = True
        #show_all_state2(env, 0)

    def train(self, n_episodes, truncate=None, show=False, delay=0):
        env = self.env  # just for convenience
        env.reset()
        total_stars = len(env.pickable_items())
        for episode in range(1, n_episodes + 1):
            #random_per_ep = pow(2, total_stars) // 4
            # run several sub-episodes per episode, each with a freshly randomized initial pickup
            random_per_ep = total_stars + 1
            for sub_ep in range(random_per_ep):
                self._run_an_episode(episode, sub_ep, truncate, show, delay)

    def test(self, action_func, delay=1):
        # run a single rendered episode, choosing actions via `action_func(state)`
        env = self.env
        env.reset()
        env.show = True
        end = False
        while not end:
            # debug
            self.show_agent_values()

            state = self.state()
            action_id = action_func(state)
            action = self.env.action_space.action_from_id(action_id)

            reward, next_location, end, _ = env.step(action)
            time.sleep(delay)
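
Below is a minimal, hypothetical driver for the class above. It builds the solution, lays out the items, trains for a few episodes and then evaluates a greedy policy; the episode count, truncation limit and delay are illustrative values, the greedy wrapper only relies on `dqn.action_values` exactly as the class itself uses it, and `Env`/`DQN` are still assumed to come from the surrounding project.

solution = DQNSolution(gamma=0.9, lr=0.001)
solution.layout()                            # place the stars and the exit
solution.train(n_episodes=50, truncate=200)  # cut off run-away episodes at 200 steps


def greedy_action(state):
    # pick the action with the largest predicted Q-value for this state
    values = solution.dqn.action_values(np.array([state]))[0]
    return int(np.argmax(values))


solution.show_all_action_values(items_encode=0)  # Q-values with no stars picked up yet
solution.test(greedy_action, delay=0.5)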