Example #1
It can be used in the form Q = init_list(y, x).
"""

# Import the cliff walking game environment from cliff.py.
from random import random, choice
from cliff import Cliff, init_list

# Constants
size_x = 10  # width of the cliff
size_y = 5  # height of the cliff
gamma = 0.9  # discount rate
alpha = 0.001  # learning rate
i_episode = 100000  # total number of episodes
epsilon = 0.05  # epsilon-greedy constant

env = Cliff(size_x, size_y)

# Initialize the value function.
# Stored in the form Q[y][x][a] = total reward.
Q1 = init_list(size_x, size_y)
Q2 = init_list(size_x, size_y)

# Select action a at state pos using the greedy policy.


def greedy(q, pos):
    max_Q = -100000000  # stores the maximum Q value
    max_action = -1  # action that maximizes the Q value

    # Compute argmax Q(state, action).
    for action in range(4):
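        # NOTE: the source page truncates this example here. A minimal sketch of
        # how the argmax loop plausibly continues, assuming pos is a (y, x) pair
        # and the Q[y][x][a] layout initialized above (not the author's code):
        if q[pos[0]][pos[1]][action] > max_Q:
            max_Q = q[pos[0]][pos[1]][action]
            max_action = action

    return max_action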
Example #2
            a_ = agent.action(s)

        if max_e and e >= max_e:
            break

    return ep_lens, rewards


if __name__ == '__main__':
    ## Run settings
    num_runs = 10  # Number of runs to average rewards over
    n = 1  # n parameter in n-step Bootstrapping

    ## Q-learning
    TN_QLearning_rewards = []
    env = Cliff()
    for i in range(num_runs):
        # Create agent
        TN_QLearning = TabularNStepQLearning(env.state_shape,
                                             env.num_actions,
                                             n=n)

        # Run training loop
        _, rewards = run_loop(env, TN_QLearning,
                              str(n) + '-step QLearning, run: ' + str(i))
        TN_QLearning_rewards.append(rewards)
    TN_QLearning_rewards = np.array(TN_QLearning_rewards)
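    # Hedged addition (not in the source example): average the per-episode
    # rewards over the runs, as the num_runs comment above suggests. This
    # assumes every run produced the same number of episodes, so the rows of
    # TN_QLearning_rewards form a regular 2-D array.
    mean_QLearning_rewards = TN_QLearning_rewards.mean(axis=0)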

    # Run the last QLearning agent using visualizations.
    # Try running this a couple of times
    run_loop(env, TN_QLearning, 'QLearning, n=' + str(n), max_e=1, render=True)
Example #3
# Imports inferred from the usage below (os.path.getsize, random, copy.deepcopy);
# the module that provides the Cliff environment is an assumption
# (cf. "from cliff import Cliff" in Example #1).
import copy
import os
import random

from cliff import Cliff


class Agent:
    cliff_70 = Cliff()
    cliff_70.reset()

    gamma = cliff_70.get_gamma()
    states = cliff_70.get_states()
    states_actions_table = cliff_70.get_actions()
    a = Cliff.a

    def __init__(self, alpha, epsilon):
        self.alpha = alpha
        self.epsilon = epsilon

        runnable = '1'

        while runnable == '1':
            to_do = input("If you want to let me begin learning, please give me 1.\n" +
                          "Or, I'll show you the map I've gotten.\n")

            q_func_70 = {}

            if to_do == '1':
                do_refresh = input("If you want to refresh the q-table, please give me 1.\n")
                do_double = input("If you want to do double Q-learning, please give me 1.\n")

                for state in Agent.states:
                    actions = Agent.states_actions_table[state]
                    for action in actions:
                        key = "%d_%d_%s" % (state[0], state[1], action)
                        q_func_70[key] = 0.0

                if do_double == '1':
                    if do_refresh == '1':
                        f_write_l = open("best_q_func_l", mode='w')
                        f_write_r = open("best_q_func_r", mode='w')
                        self.write_q_tables(f_write_l, f_write_r, q_func_70, q_func_70)
                        f_write_l.close()
                        f_write_r.close()

                    f_read_l = open("best_q_func_l", mode='r')
                    f_read_r = open("best_q_func_r", mode='r')
                    q_func_70_l = self.read_q_table(f_read_l)
                    q_func_70_r = self.read_q_table(f_read_r)
                    f_read_l.close()
                    f_read_r.close()

                    for state in Agent.states:
                        actions = Agent.states_actions_table[state]
                        for action in actions:
                            key = "%d_%d_%s" % (state[0], state[1], action)
                            q_func_70[key] = (q_func_70_l[key] + q_func_70_r[key]) / 2

                    q_func_70 = self.double_q_learning_episodes(q_func_70, alpha, epsilon, q_func_70_l, q_func_70_r)
                    """"""
                else:
                    if do_refresh == '1':
                        f_write = open("best_q_func", mode='w')
                        self.write_q_table(f_write, q_func_70)
                        f_write.close()

                    f_read = open("best_q_func", mode='r')
                    q_func_70 = self.read_q_table(f_read)
                    f_read.close()

                    q_func_70 = self.q_learning_episodes(q_func_70, alpha, epsilon)
                    """"""

            else:
                show_map = input("If you want to watch the double Q map, please give me 1.\n")
                if show_map == '1':
                    if os.path.getsize("best_q_func_0") == 0:
                        print("I do not have a double Q map.")
                    else:
                        f_read = open("best_q_func_0", mode='r')
                        q_func_70 = self.read_q_table(f_read)
                        f_read.close()

                else:
                    if os.path.getsize("best_q_func") == 0:
                        print("I do not have a Q map.")
                    else:
                        f_read = open("best_q_func", mode='r')
                        q_func_70 = self.read_q_table(f_read)
                        f_read.close()

            path_70, demo_map_70, reward_sum_70 = self.q_func_demo(q_func_70)
            print(path_70)
            print('')
            self.draw_map(demo_map_70)
            print(reward_sum_70)

            runnable = input("If you have something else you need me to do, please give me 1.\n" +
                             "Or, I'll close the program.\n")

    # Read an existing Q table from a file.
    @staticmethod
    def read_q_table(f):
        best_q_func = {}

        for f_line in f:
            if len(f_line) == 0:
                continue
            else:
                best_q = f_line.split(":")
                best_q_func[best_q[0]] = float(best_q[1])
        return best_q_func

    # Write one Q table to a file.
    @staticmethod
    def write_q_table(f, q_func):
        for key_i in q_func:
            f.write(key_i + ":" + str(q_func[key_i]) + "\n")

    # Write two Q tables to files.
    def write_q_tables(self, f_l, f_r, q_func_l, q_func_r):
        self.write_q_table(f_l, q_func_l)
        self.write_q_table(f_r, q_func_r)
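
    # For reference: each line written by write_q_table has the form
    # "%d_%d_%s:<value>", i.e. the two state coordinates and the action name
    # joined by underscores, then a colon and the Q value (the exact action
    # names come from Cliff.get_actions()). read_q_table splits each line on
    # ":" and parses the value back as a float.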

    # Draw the map.
    @staticmethod
    def draw_map(demo_map):
        for line in demo_map:
            str_line = ""
            for m in line:
                str_line += m + "  "
            print(str_line)

    # Get the best trajectory of a given Q table in the environment and its total reward.
    def q_func_demo(self, q_func):
        path = []
        now_state = Agent.cliff_70.reset()
        reward = 0
        reward_sum = 0
        path.append([now_state[0], now_state[1]])
        print(now_state)

        demo_y, demo_x = Agent.cliff_70.get_size_of_map()
        demo_map = []
        for i in range(0, demo_y):
            demo_line = []
            for j in range(0, demo_x):
                demo_line.append(' ')
                if Cliff.rewards[i][j] == Cliff.x:
                    demo_line[j] = 'x'
            demo_map.append(demo_line)

        demo_map[now_state[0]][now_state[1]] = '0'

        while reward < Agent.a - 1:
            now_action = self.greedy(q_func, now_state)  # choose the current action greedily
            now_key, next_state, reward = Agent.cliff_70.step(
                now_action)  # take the action in the environment; get the current key, next_state, and reward
            reward_sum += reward
            now_state = next_state  # move one step forward
            path.append([now_state[0], now_state[1]])
            demo_map[now_state[0]][now_state[1]] = 'o'
            print(now_state)

        demo_map[now_state[0]][now_state[1]] = 'A'

        return path, demo_map, reward_sum

    # Compute the error between the current Q table and the reference Q table.
    @staticmethod
    def compute_error(q_func, q_func_0):
        sum_delta = 0.0
        # print(q_func)
        # print(q_func_0)
        for q_func_key in q_func:
            error = q_func[q_func_key] - q_func_0[q_func_key]

            sum_delta += error * error
        return sum_delta

    # Compute the difference between the total reward of the current Q table's best path and that of the reference Q table's best path.
    def compute_reward_error(self, q_func, q_func_0):
        path_, map_, reward_ = self.q_func_demo(q_func)
        path_0, map_0, reward_0 = self.q_func_demo(q_func_0)
        return reward_ - reward_0

    # Greedy policy: select an action for a given state and Q table.
    @staticmethod
    def greedy(q_func, now_state):
        act_max_i = []
        a = now_state[0]
        b = now_state[1]
        now_actions = Agent.states_actions_table[(a, b)]
        key_i = "%d_%d_%s" % (a, b, now_actions[0])
        q_max = q_func[key_i]
        len_act = len(now_actions)

        for i in range(1, len_act):  # scan the action space for the maximum action value
            key_i = "%d_%d_%s" % (a, b, now_actions[i])
            if q_max < q_func[key_i]:
                q_max = q_func[key_i]

        for i in range(0, len_act):  # collect the actions that attain the maximum value
            key_i = "%d_%d_%s" % (a, b, now_actions[i])
            if q_func[key_i] == q_max:
                act_max_i.append(i)
        # Return any one of the actions with the maximum value.
        return now_actions[act_max_i[random.randint(0, len(act_max_i) - 1)]]

    # Epsilon-greedy policy: select an action for a given state and Q table.
    @staticmethod
    def epsilon_greedy(q_func, now_state, epsilon):
        act_max_i = []
        a = now_state[0]
        b = now_state[1]
        now_actions = Agent.states_actions_table[(a, b)]
        key_i = "%d_%d_%s" % (a, b, now_actions[0])
        q_max = q_func[key_i]
        len_act = len(now_actions)

        for i in range(1, len_act):  # scan the action space for the maximum action value
            key_i = "%d_%d_%s" % (a, b, now_actions[i])
            if q_max < q_func[key_i]:
                q_max = q_func[key_i]

        for i in range(0, len_act):  # collect the actions that attain the maximum value
            key_i = "%d_%d_%s" % (a, b, now_actions[i])
            if q_func[key_i] == q_max:
                act_max_i.append(i)

        select = random.random()

        # If the selector is below epsilon, pick uniformly from all actions.
        if select < epsilon:
            return now_actions[random.randint(0, len_act - 1)]
        # Otherwise, pick one of the best actions.
        else:
            return now_actions[act_max_i[random.randint(0, len(act_max_i) - 1)]]

    # The agent runs one episode in the environment and returns the updated Q table.
    def q_learning_episode(self, q_func, alpha, epsilon):
        now_state = Agent.cliff_70.reset()
        reward = 0

        while reward < Agent.a - 1:
            now_action = self.epsilon_greedy(q_func, now_state, epsilon)  # choose the current action with epsilon_greedy
            now_key, next_state, reward = Agent.cliff_70.step(
                now_action)  # take the action in the environment; get the current key, next_state, and reward
            # print(now_state, action, reward)
            # inf = input("")

            if reward > Agent.a - 1:
                q_func[now_key] = q_func[now_key] + alpha * (reward + Agent.gamma * reward - q_func[now_key])
            else:
                action_expect = self.greedy(q_func, next_state)  # expected action for next_state under the greedy policy
                key_expect = "%d_%d_%s" % (next_state[0], next_state[1], action_expect)  # key for the expected action

                delta_q_func = reward + Agent.gamma * q_func[key_expect] - q_func[now_key]
                q_func[now_key] = q_func[now_key] + alpha * delta_q_func
                # Update the Q entry for the current state and the chosen action.
                now_state = next_state  # move one step forward

        return q_func
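
    # The non-terminal branch above is the standard Q-learning update,
    #   Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a)),
    # where max_a' Q(s', a') is obtained by querying the greedy action for
    # next_state; the terminal branch reuses the final reward in place of
    # max_a' Q(s', a').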

    # Under double Q-learning, the agent runs one episode in the environment and returns the updated Q tables.
    def double_q_learning_episode(self, q_func, alpha, epsilon, q_func_l, q_func_r):
        now_state = Agent.cliff_70.reset()
        reward = 0

        while reward < Agent.a - 1:
            now_action = self.epsilon_greedy(q_func, now_state, epsilon)  # choose the current action with epsilon_greedy
            now_key, next_state, reward = Agent.cliff_70.step(
                now_action)  # take the action in the environment; get the current key, next_state, and reward
            # print(now_state, action, reward)
            # inf = input("")

            sel = random.randint(0, 1)

            if sel == 0:
                q_func_master = q_func_l
                q_func_slave = q_func_r
            else:
                q_func_master = q_func_r
                q_func_slave = q_func_l

            if reward > Agent.a - 1:
                q_func_master[now_key] = q_func_master[now_key] + alpha * (
                        reward + Agent.gamma * reward - q_func_master[now_key])
            else:
                action_expect = self.greedy(q_func_master, next_state)  # expected action for next_state under the greedy policy on the master table
                key_expect = "%d_%d_%s" % (next_state[0], next_state[1], action_expect)  # key for the expected action

                q_func_master[now_key] = q_func_master[now_key] + alpha * (
                        reward + Agent.gamma * q_func_slave[key_expect] - q_func_master[now_key])
                # Update the master table's entry for the current state and the chosen action.

                now_state = next_state  # move one step forward

            q_func[now_key] = (q_func_l[now_key] + q_func_r[now_key]) / 2

        return q_func, q_func_l, q_func_r
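
    # This follows the double Q-learning scheme: on each step one table is
    # chosen at random as the "master" and updated toward
    #   reward + gamma * Q_slave(s', argmax_a Q_master(s', a)),
    # so the master selects the action and the slave evaluates it, which
    # reduces the maximization bias of single-table Q-learning; q_func keeps
    # the average of the two tables for the entries visited so far.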

    # The agent runs multiple episodes, iteratively updating the Q table.
    def q_learning_episodes(self, q_func_0, alpha, epsilon):
        i = 0
        delta = []
        for k in range(0, 100):
            delta.append(1)
        sum_delta = 100.0

        # for j in range(0, 10000):
        while sum_delta != 0.0:
            i += 1
            # print(q_func_0)
            q_func = copy.deepcopy(q_func_0)
            q_func = self.q_learning_episode(q_func, alpha, epsilon)
            # print(q_func)
            now_delta = self.compute_error(q_func, q_func_0)
            q_func_0 = q_func

            delta.append(now_delta)
            del delta[0]

            sum_delta = 0.0
            for k in range(0, 100):
                sum_delta += delta[k]

            print(i, now_delta, sum_delta)

        f_write = open("best_q_func", mode='w')
        self.write_q_table(f_write, q_func_0)
        f_write.close()

        return q_func_0

    # Under double Q-learning, the agent runs multiple episodes, iteratively updating the Q tables.
    def double_q_learning_episodes(self, q_func_0, alpha, epsilon, q_func_l_0, q_func_r_0):
        i = 0
        delta = []
        for k in range(0, 100):
            delta.append(1)
        sum_delta = 100.0

        # while sum_delta != 0.0:
        for j in range(0, 10000):
            i += 1
            # print(q_func_0)
            q_func = copy.deepcopy(q_func_0)
            q_func, q_func_l_0, q_func_r_0 = \
                self.double_q_learning_episode(q_func, alpha, epsilon, q_func_l_0, q_func_r_0)
            # print(q_func)
            now_delta = self.compute_error(q_func, q_func_0)
            q_func_0 = q_func

            delta.append(now_delta)
            del delta[0]

            sum_delta = 0.0
            for k in range(0, 100):
                sum_delta += delta[k]

            print(i, now_delta, sum_delta)

        f_write_l = open("best_q_func_l", mode='w')
        f_write_r = open("best_q_func_r", mode='w')
        self.write_q_tables(f_write_l, f_write_r, q_func_l_0, q_func_r_0)
        f_write_l.close()
        f_write_r.close()

        f_write = open("best_q_func_0", mode='w')
        self.write_q_table(f_write, q_func_0)
        f_write.close()

        return q_func_0
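

# Hedged usage sketch (not part of the original example): the interactive
# training loop runs entirely inside __init__, so constructing an Agent with a
# learning rate and an exploration rate starts the program. The values below
# are illustrative only.
if __name__ == '__main__':
    Agent(alpha=0.1, epsilon=0.1)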