Example #1
def calTime(filename):
    """
    Use the Q-table to compute the time consumed to finish all tasks.
    :param filename: file holding the trained Q-table
    :return: mean completion time over the evaluation runs
    """
    task = createTask()
    env = Maze(task)
    RL = QLearningTable(actions=list(range(env.n_actions)), filename=filename)
    Time1 = []
    # Time2 = []
    for i in range(10000):
        observation = env.reset()
        while True:
            action = RL.choose_action_real(str(observation))
            observation_, reward, done = env.step(action)
            # print(observation,action,reward)
            observation = observation_
            if done:
                time1 = findmax(task)
                # time2 = calOmegaT(task,np.array([255])[0])
                Time1.append(time1)
                # Time2.append(time2)
                break
    # print(np.mean(Time1))
    # print(np.mean(Time2))
    return np.mean(Time1)
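
All of the snippets on this page drive some variant of an external QLearningTable class (typically imported from RL_brain). Constructor arguments and extra methods differ between examples (filename, Text, read_save, choose_action_real, save_q_table, step, ...), but the shared core is the classic pandas-backed tabular Q-learning agent. A minimal sketch along those lines follows; the default hyperparameters, the 'terminal' state convention and the use of pd.concat are assumptions, not taken from any single example.

import numpy as np
import pandas as pd


class QLearningTable:
    """Minimal tabular Q-learning agent: states are string keys, actions are integer column labels."""

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions          # e.g. list(range(env.n_actions))
        self.lr = learning_rate         # step size of the Q-value update
        self.gamma = reward_decay       # discount factor
        self.epsilon = e_greedy         # probability of acting greedily
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # Lazily add a zero-initialised row the first time a state is seen.
        if state not in self.q_table.index:
            new_row = pd.DataFrame([[0.0] * len(self.actions)],
                                   index=[state], columns=self.q_table.columns)
            self.q_table = pd.concat([self.q_table, new_row])

    def choose_action(self, observation):
        self.check_state_exist(observation)
        if np.random.uniform() < self.epsilon:
            # exploit: break ties between equally valued actions at random
            state_action = self.q_table.loc[observation, :]
            action = np.random.choice(
                state_action[state_action == np.max(state_action)].index)
        else:
            # explore
            action = np.random.choice(self.actions)
        return int(action)

    def learn(self, s, a, r, s_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)
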
Example #2
def main():
    trained_number = getLastExperiment('p5i3g0')
    RL = QLearningTable(list(range(len(green_states))))
    trained_path = '{}/results/{}/'.format(WORKSPACE, trained_number)
    qtable_path = trained_path + 'qtable.csv'
    RL.feedQTable(qtable_path)
    RL.epsilon = 1
    fixed,rl,actuated = testAgent('fixed', RL), testAgent('rl', RL), testAgent('actuated', RL)
    plotTestResult(rl, fixed, actuated, trained_path)
Example #3
def main():
    # --------------preparation--------------------
    rst_path, sim_path = generatePath(
        current_time)  # Create a new folder for the experiment
    RL = QLearningTable(list(range(
        len(green_states))))  # Initialize the Q-learning framework
    feed_path = '{}/results/{}/qtable.csv'.format(WORKSPACE, 'p5i3g0')
    RL.feedQTable(
        feed_path
    )  # Helpful when warm-starting from a previously trained agent
    # ---------------training--------------------
    trainAgent(RL, rst_path, sim_path)
    # --------------testing--------------------
    RL.epsilon = 1  # Epsilon-greedy no longer selects random actions
    fixed, rl, actuated = testAgent('fixed', RL), testAgent('rl',
                                                            RL), testAgent(
                                                                'actuated', RL)
    plotTestResult(rl, fixed, actuated, sim_path)
    flow_scenarios = ['-50%', '-25%', '0%', '+25%', '+50%']
    pushAgent(flow_scenarios, sim_path,
              RL)  # Explore the limit of the trained agent
    # --------------results----------------------
    RL.saveQTable('{}/qtable.csv'.format(sim_path))
    RL.plotCumulativeReward(sim_path)  # Plot the cumulative reward
    RL_params = {
        'lr': RL.alpha,
        'gamma': RL.gamma,
        'e_max': RL.e_greedy_max,
        'e_inc': RL.e_greedy_increment
    }
    writeLog(RL_params, rst_path, sim_path,
             clean=True)  # Record some basic information of the experiment
    # --------------end--------------------
    print('\nALL DONE, check {}'.format(str(current_time)))
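
Several of the examples above persist the Q-table as CSV (feedQTable, saveQTable, q_table.to_csv, pd.read_csv). The helper bodies are not shown anywhere on this page; a plausible sketch for a pandas-backed q_table follows (the method names match the calls above, but the implementations are assumptions).

import pandas as pd


class QTableCSVMixin:
    """Sketch of CSV persistence helpers for an agent holding a pandas q_table."""

    def saveQTable(self, path):
        # State labels become the CSV index column.
        self.q_table.to_csv(path)

    def feedQTable(self, path):
        # Warm-start from a previously saved table, e.g. results/p5i3g0/qtable.csv above.
        self.q_table = pd.read_csv(path, index_col=0)
        # CSV round-trips column labels as strings; restore integer action labels.
        self.q_table.columns = [int(c) for c in self.q_table.columns]
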
Example #4
def stacking_assign_q_learning(shorter_init, longer_init):
    env = Stacking(shorter_init, longer_init)
    RL = QLearningTable(actions=list(range(6)), e_greedy=1)
    if shorter_init[0] == 'A' and longer_init[0] == 'U':
        RL.q_table = RL.q_table.append(q_table_A_U)
    elif shorter_init[0] == 'C' and longer_init[0] == 'G':
        RL.q_table = RL.q_table.append(q_table_C_G)
    elif shorter_init[0] == 'G' and longer_init[0] == 'C':
        RL.q_table = RL.q_table.append(q_table_G_C)
    elif shorter_init[0] == 'G' and longer_init[0] == 'U':
        RL.q_table = RL.q_table.append(q_table_G_U)
    elif shorter_init[0] == 'U' and longer_init[0] == 'A':
        RL.q_table = RL.q_table.append(q_table_U_A)
    elif shorter_init[0] == 'U' and longer_init[0] == 'G':
        RL.q_table = RL.q_table.append(q_table_U_G)

    observation = env.shorter + "_" + env.longer
    while True:
        action = RL.choose_action(observation)
        shorter_, longer_, reward, done = env.step(action)
        observation_ = shorter_ + "_" + longer_
        # RL.learn(str(observation), action, reward, str(observation_))
        observation = observation_
        if done:
            break
    shorter_final = observation.split('_')[0]
    longer_final = observation.split('_')[1]
    return shorter_final, longer_final
Example #5
def ubp_4_assign_q_learning(shorter_init):
    env = ubp_4(shorter_init)
    RL = QLearningTable(actions=list(range(4)), e_greedy=1)
    RL.q_table = RL.q_table.append(q_table_ubp_4)
    observation = env.shorter
    while True:
        action = RL.choose_action(observation)
        shorter_, reward, done = env.step(action)
        observation_ = shorter_
        # RL.learn(str(observation), action, reward, str(observation_))
        observation = observation_
        if done:
            break
    return observation
Example #6
    def rl(self):
        RL = QLearningTable(actions=list(range(3)),
                            learning_rate=self.learning_rate,
                            reward_decay=self.reward_decay,
                            e_greedy=self.e_greedy)
        RL = self.train(self.D[:self.N], self.P[:self.N], RL)
        level = np.array([10, 10000, 10000])
        n_interval = int(self.T / self.I)
        cost_rl = np.zeros(n_interval)
        for n in range(1, n_interval + 1):
            a_real = self.D[self.N + self.V + (n - 1) * self.I:self.N +
                            self.V + n * self.I]
            r_real = self.R[self.N + self.V + (n - 1) * self.I:self.N +
                            self.V + n * self.I]
            p_real = self.P[self.N + self.V + (n - 1) * self.I:self.N +
                            self.V + n * self.I]
            d_real = (a_real - r_real)

            d_real = d_real.astype(int)

            d_real = d_real.reshape(len(d_real))
            p_real = p_real.reshape(len(p_real))

            s = 0
            step = 0
            pbar = p_real[0]
            observation = np.array([p_real[0], d_real[0], s])
            while True:
                temp_ob = observation.copy() / level
                temp_ob = temp_ob.astype(int)
                action = RL.choose_action(str(temp_ob))
                observation_, reward, done, pbar, stepcost, sl, cd, dd, gd = self.stepto(
                    action, observation, step, p_real, pbar, d_real)
                if observation_ == 'terminal':
                    cost_rl[n - 1] = cost_rl[n - 1] + stepcost
                    break
                else:
                    cost_rl[n - 1] = cost_rl[n - 1] + stepcost
                observation = observation_
                step = step + 1
                if step >= self.I:
                    break
        cost_rl_copy = cost_rl.copy()
        for i in range(len(cost_rl_copy)):
            cost_rl[i] = sum(cost_rl_copy[:i + 1])
        return cost_rl
Example #7
    def pathplanning(self):
        global root
        global view
        RL = QLearningTable(actions=list(range(self.n_actions)),
                            learning_rate=self._learningrate,
                            reward_decay=self._discountfactor,
                            e_greedy=self._egreedy)
        # update qtable
        self.currentqtable = str(RL.q_table)
        for episode in range(self._maxepisode):
            # update episode
            self.currentepisode = episode + 1

            # reset
            self._robot = self._start.copy()
            # initialize observation
            observation = str(self._robot)
            time.sleep(1)

            while True:
                # record the final path
                if (episode == self._maxepisode - 1):
                    self.finalpath.append(
                        str("(" + str(int(self._robot[0])) + "," +
                            str(int(self._robot[1])) + ")"))

                # choose action
                action = RL.choose_action(observation)
                # get new observation
                next_observation, reward, done = self.step(action)
                # learn from this observation
                RL.learn(observation, action, reward, next_observation)
                # update observation
                observation = next_observation

                # update qtable
                self.currentqtable = str(RL.q_table)
                # sleep for qml's update
                time.sleep(0.2)
                # print("#######")
                if done:
                    break
            # print(self.finalpath)
        self.isfinalpath = True
Example #8
def main():
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))

    for episode in range(100):
        if episode % 200 == 0:
            RL.save_q_table()
        # initial observation
        observation = env.reset()
        counter = 0

        while True:
            # fresh env
            env.render()
            print("Round: " + str(counter))

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation & reward
            observation_, reward, done = env.step(action)

            # RL learn from this transition
            RL.learn(observation, action, reward, observation_, done)

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                # RL.save_q_table()
                break
            else:
                time.sleep(1)
                counter += 1

        # end game
        print("end game")

    # save q_table
    RL.save_q_table()
Example #9
            y1.append(item[1])
            y2.append(item[2])
        plt.subplots()
        plt.title(key + " " + str(episode))
        plt.plot(x, y1, label="max")
        plt.plot(x, y2, label="opt")
        plt.legend()
        plt.savefig(dir + "/" + str(episode) + "/" + key + " " + str(episode) +
                    ".png")
        # plt.show()
        plt.close()


task_num = [
    3,
]
for taskNum in task_num:
    parameter["taskNum"] = taskNum
    from task import *
    task = createTask()

    # Q-learning
    env = Maze(task)
    RL = QLearningTable(
        actions=list(range(env.n_actions)),
        filename=
        "/home/zongwangz/PycharmProjects/q_learning/Figure1/Q_learning Table100_3"
    )
    update(env, RL, 1100000)
    RL.q_table.to_csv("Q_learning Table" + str(taskNum) + "_right")
Example #10
    # end of game


#    print(RL.q_table)
    new_table = pd.DataFrame(dtype=np.float64)

    for i in RL.q_table._stat_axis.values.tolist():
        temp_list = i[1:-1].split(',')
        if (len(temp_list) >= 4):
            ycor = (
                (float(temp_list[1]) + float(temp_list[3])) / 2 - 20) / 40 + 1
            xcor = (
                (float(temp_list[0]) + float(temp_list[2])) / 2 - 20) / 40 + 1
            new_table = new_table.append(
                pd.Series(
                    RL.q_table.loc[i, :],
                    index=RL.q_table.columns,
                    name='({},{})'.format(int(xcor), int(ycor)),
                ))
    print(new_table)
    print('game over')
    env.destroy()

if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable()
    # call update() once after 100 ms
    env.after(1, update)
    env.mainloop()
Example #11
            if done:
                break
    print(state_max, state_min)
    return state_max, state_min



def discretize_state(state):
    discrete_num = 10
    state_dim = env.observation_space.shape[0]
    dis_state = np.ones(state_dim)
    for i in range(state_dim):
        state_range = env.observation_space.high[i] - env.observation_space.low[i]
        # check whether the range is effectively infinite; if so, fall back to
        # the min/max observed by get_range()
        if state_range > 1000000:
            if state[i] > state_max[i]:
                # clamp values above the observed maximum into the top bin
                dis_state[i] = int((state_max[i]-state_min[i])/((state_max[i]-state_min[i])/discrete_num))
            elif state[i] < state_min[i]:
                # clamp values below the observed minimum as well
                dis_state[i] = int((state_max[i]-state_min[i])/((state_max[i]-state_min[i])/discrete_num))
            else:
                dis_state[i] = int((state[i]-state_min[i])/((state_max[i]-state_min[i])/discrete_num))
        else:
            dis_state[i] = int((state[i]-env.observation_space.low[i])/((env.observation_space.high[i]-env.observation_space.low[i])/discrete_num))
    return dis_state


if __name__ == "__main__":
    env = gym.make('CartPole-v0')
    RL = QLearningTable(actions=list(range(env.action_space.n)))
    state_max, state_min = get_range()
    update()
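
The update() function driven by this __main__ block is not part of the fragment above. A hedged sketch of what such a loop could look like, reusing env, RL and discretize_state from the surrounding code (the episode count, the str() state keys and the learn() signature are assumptions based on the other examples on this page):

def update(n_episodes=200):
    for episode in range(n_episodes):
        observation = env.reset()
        state = str(discretize_state(observation))
        while True:
            action = RL.choose_action(state)
            # old gym API: step() returns (observation, reward, done, info)
            observation_, reward, done, info = env.step(action)
            state_ = str(discretize_state(observation_))
            RL.learn(state, action, reward, state_)
            state = state_
            if done:
                break
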
    
Example #12
def update():
    start = time.time()
    RL = QLearningTable(n_states=nodes_num,
                        each_services_nums=each_services_nums,
                        max_services_num=max_services_num,
                        nodeSet_file=nodeSet_file,
                        conf_file=conf_file,
                        learning_rate=ALPHA,
                        reward_decay=GAMMA,
                        e_greedy=EPSILON)
    max_reward = 0

    for episode in range(MAX_EPISODES):
        # initial observation
        state = 0
        # print("episode = {}".format(episode))

        while True:
            # RL choose action based on observation
            action = RL.choose_action(state)

            # RL take action and get next observation and reward
            state_, reward, done = RL.step(state, action)

            # print("s = {0}, a = {1}, s_ = {2}, reward = {3}".format(
            #     state, action, state_, reward
            # ))

            # RL learn from this transition
            RL.learn(state, action, reward, state_)

            # swap observation
            state = state_

            # break while loop when end of this episode
            if done:
                # print("services = {0}, reward = {1}, runtime = {2}, episode = {3} ".format
                #       (RL.choose_services, reward, time.time()-start, episode))
                if episode == 0:
                    max_reward = reward
                else:
                    if reward > max_reward:
                        max_reward = reward
                        print(
                            "services = {0}, reward = {1}, runtime = {2}, episode = {3} "
                            .format(RL.choose_services, reward,
                                    time.time() - start, episode))
                        line = [x for x in RL.choose_services]
                        line.append(reward)
                        line.append(time.time() - start)
                        line.append(episode)
                        # print(line)
                        fp = open(outfile, 'a+')
                        fp.write(str(line) + '\n')
                        fp.close()
                    else:
                        if episode % 100 == 0:
                            print("episode = {}".format(episode))
                break

        # termination condition
        if episode >= ERROR_COUNT:
            del judge_list[0]
        judge_list.append(reward)

        if episode >= 1000 and episode % ERROR_COUNT == 0:
            if max(judge_list) - min(judge_list) <= ERROR_RANGE:
                output = "\n  达到收敛条件,提前终止实验!\n"
                line = [x for x in RL.choose_services]
                line.append(reward)
                line.append(time.time() - start)
                line.append(episode)
                # print the convergence result
                print(output)
                print(line)
                # record the convergence result
                fp = open(outfile, 'a+')
                fp.write(output)
                fp.write(str(line) + '\n')
                fp.close()
                break

    print('game over')
Example #13
            RL.learn(str(s), action, reward, str(s_), Text, tot_action,
                     len(granul))

            #time.sleep(3)
            #print(s, action, reward, s_)

            s = s_  # break while loop when end of this episode

            if done == True:
                break

        #print("Episode Over")

        # end of game
    '''
    print('Game Over, Best Reward Ever:', "%.2f%%" % Perc_Best, Text)
    End_Time = datetime.datetime.now().strftime('%m-%d %H:%M')
    print("Started Time: " + Start_Real_Time + ", End Time: " + End_Time)
    rp.plot_legend_text(Time_Best, Light_Best, Light_Feed_Best, Action_Best, r_Best, perf_Best, SC_Best, SC_Best_norm_hist, SC_Feed_Best, Occup_Best, Text, best_reward, tot_episodes)
    rp.plot_reward_text(Tot_Episodes, Tot_Reward, Text, best_reward, tot_episodes)
    '''


if __name__ == "__main__":
    #env = Maze()
    #RL = QLearningTable(actions=list(range(env.n_actions)))
    RL = QLearningTable(actions=list(range(tot_action)))
    update()
    #env.after(100, update)
    #env.mainloop()
Example #14
                    # RL take action and get next observation and reward
                    observation_, reward, done = env.step(
                        observation, eval(action))

                    RL.learn(str(observation), str(action), reward,
                             str(observation_))
                    # swap observation
                    observation = observation_

                    # # break while loop when end of this episode
                    if done:
                        isRunning = False
                        print(episode, len(RL.q_table.index))
                        break
                        # RL learn from this transition

    except KeyboardInterrupt:
        # RL.q_table.to_pickle("./Data/dataframe.pk1")
        sys.exit()
    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=env.action_space)
    env.after(1, update)

env.mainloop()
Example #15
import gym
from RL_brain import QLearningTable

env = gym.make('MountainCar-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = QLearningTable(actions=list(range(3)))
total_steps = 0

for i_episode in range(10):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(str(observation))
        observation_, reward, done, info = env.step(action)
        position, velocity = observation_
        # the higher the better
        reward = abs(position - (-0.5))  # r in [0, 1]
        RL.learn(str(observation), action, reward, str(observation_))
        ep_r += reward
        if done:
            get = '| Get' if observation_[
                0] >= env.unwrapped.goal_position else '| ----'
            print('Epi: ', i_episode, get, '| Ep_r: ', round(ep_r, 4),
                  '| Epsilon: ', round(RL.epsilon, 2))
Example #16
    cost.append(cost_)
    density.append(density_)
    if find_target_node_:
        num_find_target += 1
    opt_cost.append(opt_cost_)
    return cost, density, num_find_target, opt_cost


if __name__ == "__main__":
    # r = input('times: ')
    r = '50000'
    save_list = [100, 50000]
    # ,10000,50000,100000,200000,300000,400000,500000,600000,700000,800000,900000,1000000
    train = True
    env = envR(show=False)
    RL = QLearningTable(env.action_space, learning_rate=0.1)
    # step = 0
    # succ = 0
    # start = time.time()
    for episode in range(int(r)):
        pre_maps = env.reset()

        for i in range(100):

            action = RL.choose_action(str(pre_maps), train)

            reward, done, action_ = env.step(action)

            RL.learn(str(pre_maps), action, reward, str(env.get_maps()), done)

            pre_maps = env.get_maps()
Example #17
            # fresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(str(observation))

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                # used when saving the q_table
                RL.save_q_table()
                break

    # end of game
    print('game over')
    print(RL.q_table)
    env.destroy()

if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)),read_save=False)
    env.after(100, update)
    env.mainloop()
Example #18
##    en.rfcomm5()
##    rfcomm_0.start()
##    rfcomm_1.start()
##    rfcomm_2.start()
##    rfcomm_3.start()
##    rfcomm_4.start()
##    rfcomm_5.start()
##    rfcomm_0.join()
##    rfcomm_1.join()
##    rfcomm_2.join()
##    rfcomm_3.join()
##    rfcomm_4.join()
##    rfcomm_5.join()
##    print('time :',time.time() - s_t)
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    RL1 = QLearningTable(actions=list(range(env.n_actions)))
    RL2 = QLearningTable(actions=list(range(env.n_actions)))
    RL3 = QLearningTable(actions=list(range(env.n_actions)))
    RL4 = QLearningTable(actions=list(range(env.n_actions)))
    RL5 = QLearningTable(actions=list(range(env.n_actions)))
    #Thread_1 = threading.Thread(target = updeee)
    #env.mainloop()
    #env.after(100, update)
    #env.after(100, update1)
    #Thread_0.start()
    #Thread_1.start()
    #env.mainloop()
    #Thread_0.join()
    #Thread_1.join()
    env.after(100, update)
Example #19
            # fresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(str(observation))

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    env.after(100, update)
    env.mainloop()
    RL.print_q_table()
Example #20
class App:
    def __init__(self, master):
        self.master = master

        #        grid map setting
        self.grid_origx = 500
        self.grid_origy = 20
        self.grid_columnNum = 8
        self.grid_rowNum = 8
        self.grid_UNIT = 90
        self.maze_size = self.grid_columnNum * self.grid_rowNum
        #        define total training episodes
        self.episode = 1000
        #        define number of tests to run
        self.tests = 100
        #        set a small amount of delay (seconds) to make sure tkinter works properly
        #        if you want slower visualization for testing, set the delay to a larger value
        self.timeDelay = 0.005

        #       other initialization
        self.n_actions = 4
        self.outline = 'black'
        self.fill = None
        self.item_type = 0
        self.learning = False
        self.itemsNum = 0
        self.epsilon = 0.9
        self.Qtable_origx = self.grid_origx + 20 + (self.grid_columnNum +
                                                    1) * self.grid_UNIT
        self.Qtable_origy = self.grid_origy
        self.grid_origx_center = self.grid_origx + self.grid_UNIT / 2
        self.grid_origy_center = self.grid_origy + self.grid_UNIT / 2
        self.Qtable_gridIndex_dict = {}
        self.show_q_table = pd.DataFrame(columns=list(range(self.n_actions)),
                                         dtype=np.float64)
        self.origDist = 10
        self.agentCentre = np.array([[190, 180], [290, 180], [390, 180]])
        self.warehouseCentre = self.agentCentre+np.array([[0,self.grid_UNIT+self.origDist],\
                                [0,self.grid_UNIT+self.origDist],[0,self.grid_UNIT+self.origDist]])
        self.ObstacleCentre1 = np.array([[725, 515], [725, 335], [635, 695]])

        self.ObstacleCentre2 = np.array([[905, 245], [545, 245], [995, 605]])
        self.itemOrigPosition = []
        self.agentPosition_list = []
        self.warehousePostition_list = []
        self.ObstaclePosition_list = []
        self.WarehouseItemIndex = []
        self.agentItemIndex = []
        self.ObstacleItemIndex = []
        self.AllItemsOrigPosition_list = []
        self.createMark = None
        self.points = []
        self.cars_list = []
        self.selected_agent = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.selected_Obstacles = []
        self.selected_targets = []
        self.agent = 1
        self.target = 4
        self.hell1 = 7
        self.hell2 = 8
        self.init_widgets()
        self.temp_item = None
        self.temp_items = []
        self.choose_item = None
        self.created_line = []
        self.lines = []

    def resize(self, w, h, w_box, h_box, pil_image):
        '''
        resize a pil_image
        '''
        return pil_image.resize((w_box, h_box), Image.ANTIALIAS)

    def init_widgets(self):

        self.cv = Canvas(root, background='white')
        self.cv.pack(fill=BOTH, expand=True)
        # bind events of dragging with mouse
        self.cv.bind('<B1-Motion>', self.move)
        self.cv.bind('<ButtonRelease-1>', self.move_end)
        self.cv.bind("<Button-1>", self.leftClick_handler)

        # bind events of double-left-click
        self.cv.bind("<Button-3>", self.rightClick_handler)
        f = ttk.Frame(self.master)
        f.pack(fill=X)
        self.bns = []

        # initialize buttons
        for i, lb in enumerate(
            ('Reset', 'Start training', 'Close', 'Save', 'Start Running')):
            bn = Button(f, text=lb, command=lambda i=i: self.choose_type(i))
            bn.pack(side=LEFT, ipadx=8, ipady=5, padx=5)
            self.bns.append(bn)
        self.bns[self.item_type]['relief'] = SUNKEN

        #initialize agent, warehouses and obstacles positions
        self.agentPosition_list = self.setItemsPositionList(self.agentCentre)
        self.warehousePostition_list = self.setItemsPositionList(
            self.warehouseCentre)
        self.ObstaclePosition_list1 = self.setItemsPositionList(
            self.ObstacleCentre1)
        self.ObstaclePosition_list2 = self.setItemsPositionList(
            self.ObstacleCentre1)
        self.ObstaclePosition_list = self.ObstaclePosition_list1 + self.ObstaclePosition_list2
        self.create_items()
        self.itemsNum = self.warehouseCentre.shape[
            0] + self.ObstacleCentre1.shape[0] + self.ObstacleCentre2.shape[
                0] + self.agentCentre.shape[0]
        R = self.grid_UNIT
        self.cv.create_text(self.agentCentre[0][0]-R-20,self.agentCentre[0][1],\
                            text = "Agent:",font=('Courier',18))
        self.cv.create_text(self.warehouseCentre[0][0]-R-20,self.warehouseCentre[0][1],\
                            text = "Warehouse:",font=('Couried',18))
        self.cv.create_text(self.grid_origx+250,self.grid_origy-50, text = "Single agent Q-Learning Simulation",\
                            font=('Times',38),fill = 'red')
        self.cv.create_text(self.grid_origx+252,self.grid_origy-52, text = "Single agent Q-Learning Simulation",\
                            font=('Times',38),fill = 'green')

        #draw grids
        self.create_grids(self.grid_origx, self.grid_origy,
                          self.grid_columnNum, self.grid_rowNum,
                          self.grid_UNIT)

        for i in range(0, self.grid_rowNum):
            for j in range(0, self.grid_columnNum):
                x = i * self.grid_UNIT + self.grid_origx_center
                y = j * self.grid_UNIT + self.grid_origy_center
                rowIndex = (y - self.grid_origy_center) / self.grid_UNIT
                columnIndex = (x - self.grid_origx_center) / self.grid_UNIT
                self.Qtable_gridIndex_dict[(
                    x, y)] = rowIndex * self.grid_columnNum + columnIndex

        print(self.Qtable_gridIndex_dict)

    def create_ObsItems(self):
        self.cv.arriveObsImage = []
        self.cv.bms_obs = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT

        pil_image = Image.open('obs5.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image1 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image1)

        pil_image = Image.open('obs7.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image2 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image2)

        pil_image = Image.open('obs8.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image3 = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_obs.append(tk_image3)

        self.cv.bms_obs.append(tk_image1)
        self.cv.bms_obs.append(tk_image2)
        self.cv.bms_obs.append(tk_image3)

        self.cv.Obstacle = []
        index = 0
        for q in self.ObstacleCentre1:
            bm = self.cv.bms_obs[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1
        for q in self.ObstacleCentre2:
            bm = self.cv.bms_obs[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.Obstacle.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

        #arriving picture
        pil_image = Image.open('obs5_car.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.arriveObsImage.append(tk_image)

    def create_targetItems(self):
        self.cv.arriveImage = []
        self.cv.bms_wh = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT

        pil_image = Image.open('warehouse4_1.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)

        pil_image = Image.open('warehouse3.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)

        pil_image = Image.open('warehouse4_2.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms_wh.append(tk_image)

        self.cv.warehouse = []
        index = 0
        for q in self.warehouseCentre:
            bm = self.cv.bms_wh[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.warehouse.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

        #arriving picture
        pil_image = Image.open('warehouse3_car.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.arriveImage.append(tk_image)

    def create_agentItems(self):
        self.cv.bms = []
        w_box, h_box = self.grid_UNIT, self.grid_UNIT

        pil_image = Image.open('car9.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)

        pil_image = Image.open('car2.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)

        pil_image = Image.open('car8.jpg')
        w, h = pil_image.size
        pil_image_resized = self.resize(w, h, w_box, h_box, pil_image)
        tk_image = ImageTk.PhotoImage(pil_image_resized)
        self.cv.bms.append(tk_image)

        self.cv.car = []
        index = 0
        for q in self.agentCentre:
            bm = self.cv.bms[index]
            t = self.cv.create_image(q[0], q[1], image=bm)
            self.cv.car.append(t)
            self.AllItemsOrigPosition_list.append(self.cv.coords(t))
            index += 1

    def setItemsPositionList(self, itemCentre):
        npTemp = np.hstack((itemCentre, itemCentre))
        #        print("npTemp=",npTemp)
        h_u = self.grid_UNIT / 2
        npHalfUnit = np.array([-h_u, -h_u, h_u, h_u])
        hs = npHalfUnit
        for diam in range(1, itemCentre.shape[0]):
            hsTemp = np.vstack((npHalfUnit, hs))
            hs = hsTemp


#            print("hs=",hs)
        return (npTemp - hs).tolist()

    def button_reset(self):
        time.sleep(self.timeDelay)
        if self.createMark is not None:
            self.cv.delete(self.createMark)
        for line in self.created_line:
            self.cv.delete(line)
        self.cv.coords(self.agent, self.selected_agent_position)

        coords = self.cv.coords(self.agent)
        return coords

    def reset(self):
        """
        reset the agent to a random valid location
        """
        if self.lines != []:
            for line in self.lines:
                self.cv.delete(line)
        Obs_list = self.ObstaclePosition_list
        while True:
            new_loc = [
                random.randrange(
                    self.grid_origx_center,
                    self.grid_rowNum * self.grid_UNIT + self.grid_origx_center,
                    self.grid_UNIT),
                random.randrange(
                    self.grid_origy_center,
                    self.grid_columnNum * self.grid_UNIT +
                    self.grid_origy_center, self.grid_UNIT)
            ]
            if new_loc not in Obs_list:
                break
        self.cv.coords(self.selected_agent[0], new_loc)
        coords = self.cv.coords(self.selected_agent[0])
        return coords

    def choose_best_action(self, state, terminal):
        """
        choose best action from Q_table
        """
        if terminal == self.cv.coords(self.target):
            q_table = self.q_table
        state_action = q_table.loc[state]
        action = np.random.choice(
            state_action[state_action == np.max(state_action)].index)
        return int(action)

    def run(self):
        """
        main function for runing tests
        """
        test = 0
        rewards = []
        action = -1
        observation = self.cv.coords(self.agent)
        done = 0
        total_reward = 0
        terminal = self.cv.coords(self.target)
        visited = [observation]
        #        enhance_list = []
        win_count = 0

        while True:
            self.labelHello = Label(self.cv,
                                    text="Test:%s" % str(test),
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="blue",
                                    bg="white")
            self.labelHello.place(x=self.agentCentre[0][0] - 150,
                                  y=self.agentCentre[0][1] + 500,
                                  anchor=NW)
            time.sleep(self.timeDelay)

            action = self.choose_best_action(str(observation), terminal)
            observation_ = self.calcu_next_state(observation, action)
            reward = self.new_reward(observation_, observation)

            if observation_ in visited:
                reward -= 0.5
            else:
                visited.append(observation_)

            if done:
                observation_ = self.cv.coords(self.target)

            self.cv.coords(self.selected_agent[0], observation_)
            total_reward += reward

            if total_reward < -1:
                done = 1

            if done != 1:
                line = self.cv.create_line(
                    observation[0],
                    observation[1],
                    observation_[0],
                    observation_[1],
                    fill='red',
                    arrow=LAST,
                    arrowshape=(10, 20, 8),  # red arrow head
                    dash=(4, 4)  # dashed line
                )
                self.lines.append(line)

            observation = observation_
            if self.cv.coords(self.agent) == self.cv.coords(self.target):
                done = 1

            if done:
                action = -1
                visited = []
                total_reward += 1
                if total_reward == 1:
                    win_count += 1
                rewards.append(total_reward)
                total_reward = 0
                self.reset()
                done = 0
                observation = self.cv.coords(self.agent)
                test += 1
            if test > self.tests:
                self.labelHello = Label(self.cv,
                                        text="running end!!",
                                        font=("Helvetica", 10),
                                        width=10,
                                        fg="red",
                                        bg="white")
                self.labelHello.place(x=250, y=750, anchor=NW)
                break
        print("win_count", win_count)
        plt.figure()
        plt.title('Score per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Score')
        plt.plot(rewards)
        plt.show()

    def render(self):
        time.sleep(self.timeDelay)

    def format_time(self, seconds):
        if seconds < 400:
            s = float(seconds)
            return "%.1f seconds" % (s, )
        elif seconds < 4000:
            m = seconds / 60.0
            return "%.2f minutes" % (m, )
        else:
            h = seconds / 3600.0
            return "%.2f hours" % (h, )

    def reward(self, s_, s):
        """
        rewarding scheme
        """
        self.target = self.selected_targets[0]
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMarkA = t
            reward = 1
            done = True

        elif s_ in self.selected_Obstacles_position:
            reward = -0.75
            done = False

        else:
            reward = -0.04
            done = False

        return reward, done

    def calcu_next_state(self, loc, action):
        """
        calculate next state based on location and action
        """
        UNIT = self.grid_UNIT
        ss = loc
        np_s = np.array(ss)
        dissS = np.array([self.grid_origx, self.grid_origy])
        s = (np_s - dissS).tolist()
        base_action = np.array([0, 0])
        if action == 0:  # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (self.grid_rowNum - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (self.grid_columnNum - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT
        s_ = []
        s_ = [ss[0] + base_action[0], ss[1] + base_action[1]]
        return s_

    def new_reward(self, s_, s):
        """
        rewarding scheme for testing
        """
        if s_ == self.cv.coords(self.selected_targets[0]):
            t = self.cv.create_image(s_, image=self.cv.arriveImage[0])
            self.createMark = t
            reward = 0
        elif s_ in self.selected_Obstacles_position:
            reward = -2
        else:
            reward = 0
        return reward

    def update(self):
        """
        main function for training
        """
        self.RL = QLearningTable(actions=list(range(self.n_actions)),
                                 e_greedy=self.epsilon)
        episode = 0
        action = -1
        stepCount = 0
        total_reward_list = []
        avg_reward_list = []
        win_history = []
        observation = self.cv.coords(self.agent)
        visited = set()
        total_reward = 0
        start_time = datetime.datetime.now()
        self.labelHello = Label(self.cv,
                                text="start training!",
                                font=("Helvetica", 10),
                                width=10,
                                fg="red",
                                bg="white")
        self.labelHello.place(x=200, y=750, anchor=NW)
        while True:
            self.labelHello = Label(self.cv,
                                    text="episode: %s" % str(episode),
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="blue",
                                    bg="white")
            self.labelHello.place(x=200, y=550, anchor=NW)
            self.render()
            visited.add(tuple(observation))
            stepCount += 1
            action = self.RL.choose_action(str(observation))
            observation_ = self.calcu_next_state(observation, action)
            reward, done = self.reward(observation_, observation)
            self.cv.coords(self.selected_agent[0], observation_)

            if tuple(observation_) in visited:
                reward -= 0.25
            if observation == observation_:
                reward = reward - 0.8
            if done == True:
                win_history.append(1)
            total_reward += reward
            if total_reward < -0.5 * 64:
                done = True
                win_history.append(0)
            self.RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                if episode > self.episode:
                    break
                else:
                    observation = self.reset()
                    dt = datetime.datetime.now() - start_time
                    t = self.format_time(dt.total_seconds())
                    total_reward_list.append(total_reward)
                    if len(total_reward_list) > 100:
                        avg_reward = sum(total_reward_list[-100:]) / 100
                        avg_reward_list.append(avg_reward)
                        template = "Episode: {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | Total rewards: {:.3f} | Average rewards: {:.3f} | time: {}"
                        print(
                            template.format(
                                episode, self.episode, stepCount,
                                sum(win_history) / len(win_history),
                                total_reward, avg_reward, t))
                    else:
                        template = "Episode: {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | Total rewards: {:.3f} | time: {}"
                        print(
                            template.format(
                                episode, self.episode, stepCount,
                                sum(win_history) / len(win_history),
                                total_reward, t))
                    episode += 1
                    stepCount = 0
                    total_reward = 0
                    visited = set()
                    done = 0

        # end of training
        print('training over!')
        self.labelHello = Label(self.cv,
                                text="training end!",
                                font=("Helvetica", 10),
                                width=10,
                                fg="red",
                                bg="white")
        self.labelHello.place(x=200, y=750, anchor=NW)
        print("total_win_rate", sum(win_history) / len(win_history))
        print("total_time", t)
        print("average rewards per episode",
              sum(total_reward_list) / len(total_reward_list))
        self.learning = False
        self.reset()
        plt.figure()
        plt.title('Rewards per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(total_reward_list)

        plt.show()

        plt.figure()
        plt.title('Average Rewards over 100 Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(avg_reward_list)

        plt.show()

    def create_items(self):
        self.AllItemsOrigPosition_list.append([0, 0, 0, 0])
        self.create_agentItems()
        self.agentItemIndex = [1, len(self.agentPosition_list)]
        self.create_targetItems()
        self.WarehouseItemIndex = [
            self.agentItemIndex[1] + 1,
            self.agentItemIndex[1] + len(self.warehousePostition_list)
        ]
        self.create_ObsItems()
        self.ObstacleItemIndex = [
            self.WarehouseItemIndex[1] + 1,
            self.WarehouseItemIndex[1] + len(self.ObstaclePosition_list)
        ]

    def create_grids(self, origx, origy, column, row, UNIT):
        # create grids
        for c in range(origx, origx + (column + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = c, origy, c, origy + row * UNIT
            self.cv.create_line(x0, y0, x1, y1, width=2)
        for r in range(origy, origy + (row + 1) * UNIT, UNIT):
            x0, y0, x1, y1 = origx, r, origx + row * UNIT, r
            self.cv.create_line(x0, y0, x1, y1, width=2)

    def choose_type(self, i):
        """
        function of clicking different button
        """
        for b in self.bns:
            b['relief'] = RAISED
        self.bns[i]['relief'] = SUNKEN
        self.item_type = i
        if self.item_type == 1:
            #            start training
            self.start_learning()
            self.bns[i]['relief'] = RAISED
        elif self.item_type == 2:
            #            close simulation tool
            os._exit(0)
        elif self.item_type == 3:
            #           save q_table
            temp_s = str(self.cv.coords(self.target)) + str(
                self.selected_Obstacles_position)
            self.RL.q_table.to_csv("single_qtable_%s.csv" % temp_s,
                                   index_label="index_label")
            print("SAVED!!!")
            self.labelHello = Label(self.cv,
                                    text="table saved!!",
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="red",
                                    bg="white")
            self.labelHello.place(x=350, y=750, anchor=NW)
        elif self.item_type == 0:
            self.button_reset()
        elif self.item_type == 4:
            #            start running tests
            self.start_running()
        elif self.item_type == 5:
            self.restart()

    def start_learning(self):
        """
        initialization for training process
        """
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []

        for item in range(1, self.itemsNum + 1):

            p = self.cv.coords(item)

            if p[0] >= self.grid_origx and p[1] >= self.grid_origy:
                if item in range(self.agentItemIndex[0],
                                 self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position = p
                elif item in range(self.WarehouseItemIndex[0],
                                   self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                elif item in range(self.ObstacleItemIndex[0],
                                   self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)

        if len(self.selected_agent) == 0 or len(self.selected_agent) > 1:
            tkinter.messagebox.showinfo(
                "INFO", "Please choose ONE agent for trainning!")
        elif len(self.selected_targets) == 0 or len(self.selected_targets) > 1:
            tkinter.messagebox.showinfo(
                "INFO", "Please choose ONE target for trainning!")
        else:
            self.agent = self.selected_agent[0]
            self.target = self.selected_targets[0]

            self.t = threading.Timer(self.timeDelay, self.update)
            self.t.start()
            self.learning = True

    def start_running(self):
        """
        initialization for testing
        """
        self.selected_agent = []
        self.selected_targets = []
        self.selected_Obstacles = []
        self.selected_agent_position = []
        self.selected_Obstacles_position = []
        self.selected_targets_position = []

        for item in range(1, self.itemsNum + 1):
            p = self.cv.coords(item)
            if p[0] >= self.grid_origx and p[1] >= self.grid_origy:
                if item in range(self.agentItemIndex[0],
                                 self.agentItemIndex[1] + 1):
                    self.selected_agent.append(item)
                    self.selected_agent_position = p
                elif item in range(self.WarehouseItemIndex[0],
                                   self.WarehouseItemIndex[1] + 1):
                    self.selected_targets.append(item)
                    self.selected_targets_position = p
                elif item in range(self.ObstacleItemIndex[0],
                                   self.ObstacleItemIndex[1] + 1):
                    self.selected_Obstacles.append(item)
                    self.selected_Obstacles_position.append(p)

        if len(self.selected_agent) <= 0 or len(self.selected_agent) > 1:
            tkinter.messagebox.showinfo("INFO",
                                        "Please place ONE agent on map!")
        elif len(self.selected_targets) == 0 or len(self.selected_targets) > 1:
            tkinter.messagebox.showinfo("INFO", "Please choose ONE terminal!")
        else:
            self.agent = self.selected_agent[0]
            self.target = self.selected_targets[0]

            #            load Q table
            terminal_str = str(self.selected_targets_position) + str(
                self.selected_Obstacles_position) + 'episode3000'
            self.q_table = pd.read_csv("table terminal%s.csv" % terminal_str,
                                       index_col=0)
            self.t = threading.Timer(self.timeDelay, self.run)
            self.t.start()
            self.learning = True

    def rightClick_handler(self, event):
        self.start_learning()

    def leftClick_handler(self, event):
        """
        bind events of choosing warehouse
        """

        if self.learning:
            print("Learing on going!")
        else:
            for item in range(1, self.itemsNum + 1):
                position = self.cv.coords(item)
                R = self.grid_UNIT / 2
                p = [
                    position[0] - R, position[1] - R, position[0] + R,
                    position[1] + R
                ]
                if event.x>=p[0] and event.x<=p[2] and \
                    event.y>=p[1] and event.y<=p[3]:
                    t = item

                    self.choose_item_handler(event, t)

    def choose_item_handler(self, event, t):

        self.choose_item = t

        self.itemOrigPosition = self.cv.coords(t)

    def move(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            self.cv.coords(t, event.x, event.y)

    def adjust_items_into_grids(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            position = self.cv.coords(t)
            centerX = position[0]
            centerY = position[1]
            Grids_X0 = self.grid_origx
            Grids_X1 = self.grid_origx + (self.grid_columnNum +
                                          1) * self.grid_UNIT
            Grids_Y0 = self.grid_origy
            Grids_Y1 = self.grid_origy + (self.grid_rowNum +
                                          1) * self.grid_UNIT
            if (centerX in range(Grids_X0, Grids_X1)) and (centerY in range(
                    Grids_Y0, Grids_Y1)):
                columnIndex = math.floor((centerX - Grids_X0) / self.grid_UNIT)
                rowIndex = math.floor((centerY - Grids_Y0) / self.grid_UNIT)
                adjustedX0 = Grids_X0 + columnIndex * self.grid_UNIT + self.grid_UNIT / 2
                adjustedY0 = Grids_Y0 + rowIndex * self.grid_UNIT + self.grid_UNIT / 2
                self.cv.coords(t, adjustedX0, adjustedY0)
            else:
                # return to the original position if not dragged near the grids
                self.cv.coords(t, self.AllItemsOrigPosition_list[t])
                self.itemOrigPosition = []

    def move_end(self, event):
        if self.choose_item is not None:
            t = self.choose_item
            self.adjust_items_into_grids(event)
            self.choose_item = None

    def delete_item(self, event):
        if self.choose_item is not None:
            self.cv.delete(self.choose_item)
Example #21
            # break while loop when end of this episode
            if done:
                reward_list.append(r)
                break

    # end of game
    print('game over')
    # env.destroy()


if __name__ == "__main__":
    env = MDP_env()
    n_actions = 2
    n_features = 12
    reward_list = []
    RL = QLearningTable(n_features, actions=list(range(n_actions)))
    episode_memories = defaultdict(list)
    update()
    av_reward = [
        np.mean(reward_list[0:i + 1]) for i in range(len(reward_list))
    ]
    plt.plot(np.arange((len(reward_list))), av_reward)
    plt.show()
    np.set_printoptions(suppress=True)
    RL.q_table['a'] = RL.q_table.idxmax(axis=1)
    print(
        np.hstack([
            np.arange(6).reshape(6, 1),
            RL.q_table.sort_index(axis=0, ascending=True).values
        ]))
    print("")
Example #22
    for i in range(0, int(num_days) * 10):
        if i == 0:
            use_new_table = True
        else:
            use_new_table = False
        Text_Table = Text

        if day % 10 == 0:
            learn_single_day = False
            epsilon = 0.1
            start_day = very_start_day
            end_day = day
        else:
            learn_single_day = True
            epsilon = 1
            start_day = day - 5
            end_day = start_day + 10
        Text = ''.join(str(elem) for elem in Text_list)
        print("Day: ", str(day / 10), ", Exp name: ", Text,
              ", Use new Table: ", use_new_table, ", epsilon: ", epsilon,
              ", Train(False)/Test(true): ", learn_single_day)
        time.sleep(5)
        RL = QLearningTable(actions=list(range(tot_action)),
                            Text=Text,
                            Text_Table=Text_Table,
                            use_new_table=use_new_table,
                            epsilon=epsilon)
        update(start_day, end_day)
        day += 5
        Text_list[-1] = day
Example #23
            else:
                is_hell = False
            step_num += 1

            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy(
    )  # after 100 episodes the game is over and the environment is destroyed


# Related annotated references: https://cloud.tencent.com/developer/article/1148483
# https://blog.csdn.net/duanyajun987/article/details/78614902
if __name__ == "__main__":
    env = Maze()  # initialize and create the environment with tkinter
    RL = QLearningTable(actions=list(range(env.n_actions)))  # define the RL agent and initialize its parameters

    env.after(100, update)  # call update() after 100ms
    # update()
    env.mainloop()  #
    # update()  # does not work if placed after mainloop()
Example #24
if __name__ == "__main__":
    df_re = pd.read_csv(os.path.dirname(os.getcwd()) + "/dataset/" +
                        configuration.CITY + '_public_node_relation.csv',
                        encoding='utf-8')
    df_co = pd.read_csv(os.path.dirname(os.getcwd()) + "/dataset/" +
                        configuration.CITY + '_node&tel.csv',
                        encoding='utf-8')
    x = df_co['lon'].round(decimals=6).tolist()
    y = df_co['lat'].round(decimals=6).tolist()
    cross_relation = tools.get_cross_info(df_re)
    cross_info = df_co.values.tolist()
    next_state_list, distance_list, action_list, tel_list = tools.get_details(
        cross_relation)

    # TODO: Start_Point & End_Point to be provided as input
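    # Sample a random start/end intersection for each seed and train a separate Q-table per origin-destination pair, saving each table to CSV.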
    for i in range(166, 288):
        np.random.seed(i)
        start_point = np.random.randint(0, 800)
        end_point = np.random.randint(801, 1725)
        RL = QLearningTable(ACTIONS)
        env = Cross(next_state_list, action_list, distance_list, start_point,
                    end_point, cross_info)
        q_table = update(env, start_point, end_point)
        q_table.to_csv(os.getcwd() + '/table/' + configuration.CITY + '_' +
                       str(start_point) + '_' + str(end_point) + '_' +
                       'q_table.csv',
                       encoding="utf-8")
    # df_q_tb = pd.read_csv(os.getcwd() + '/table/0_363_q_table.csv', encoding='utf-8')
    # q_table = df_q_tb.values.tolist()
Пример #25
0
def update():
    for episode in range(100):
        # initial observation
        observation = Model.reset()  # assumed: environment reset call (missing from the original snippet)

        while True:
            # RL choose action based on observation
            action = RL.choose_action(str(observation))

            # RL take action and get next observation and reward
            observation_, reward, done = handle.ApplyForceOnJoint(action)

            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('Model_ready')
    

if __name__ == "__main__":
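    # handle() exposes the control interface (see ApplyForceOnJoint above); its n_actions attribute sizes the agent's action space.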
    Model = handle()
    RL = QLearningTable(actions=list(range(Model.n_actions)))
    update()
Пример #26
0
import environment
from RL_brain import QLearningTable
import numpy as np

env = Maze()

N = 20
dt = 2 * np.pi / N
ep_max = 500
fidelity = np.zeros(ep_max)

RL = QLearningTable(actions=list(range(env.n_actions)))
fid_10 = 0
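# Train for ep_max episodes; fid_10 tracks the best fidelity reached during the final episodes.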
for episode in range(ep_max):
    observation = env.reset()
    while True:

        action = RL.choose_action(str(observation))
        observation_, reward, done, fid = env.step(action)
        RL.learn(str(observation), action, reward, str(observation_))
        observation = observation_
        if done:
            if episode >= ep_max - 11:
                fid_10 = max(fid_10, fid)

            break

print('Final_fidelity=', fid_10)
Пример #27
0
# -*- coding: utf-8 -*-
from maze_env import Maze
from RL_brain import QLearningTable


def update():
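    # Minimal maze run: 100 training episodes of tabular Q-learning with on-screen rendering.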
    for episode in range(100):
        observation = env.reset()
        while True:
            env.render()
            action = RL.choose_action(str(observation))
            observation_, reward, done = env.step(action)
            RL.learn(str(observation), action, reward, str(observation_))
            observation = observation_
            if done:
                break
    print('game over')
    env.destroy()


env = Maze()
RL = QLearningTable(actions=list(range(env.n_actions)))
env.after(100, update)
env.mainloop()
Пример #28
0
    def update(self):
        """
        main function for training
        """
        self.RL = QLearningTable(actions=list(range(self.n_actions)),
                                 e_greedy=self.epsilon)
        episode = 0
        action = -1
        stepCount = 0
        total_reward_list = []
        avg_reward_list = []
        win_history = []
        observation = self.cv.coords(self.agent)
        visited = set()
        total_reward = 0
        start_time = datetime.datetime.now()
        self.labelHello = Label(self.cv,
                                text="start training!",
                                font=("Helvetica", 10),
                                width=10,
                                fg="red",
                                bg="white")
        self.labelHello.place(x=200, y=750, anchor=NW)
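        # Main training loop: one step per iteration; an episode ends when the reward function signals done or the cumulative reward drops below -0.5 * 64.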
        while True:
            self.labelHello = Label(self.cv,
                                    text="episode: %s" % str(episode),
                                    font=("Helvetica", 10),
                                    width=10,
                                    fg="blue",
                                    bg="white")
            self.labelHello.place(x=200, y=550, anchor=NW)
            self.render()
            visited.add(tuple(observation))
            stepCount += 1
            action = self.RL.choose_action(str(observation))
            observation_ = self.calcu_next_state(observation, action)
            reward, done = self.reward(observation_, observation)
            self.cv.coords(self.selected_agent[0], observation_)

            if tuple(observation_) in visited:
                reward -= 0.25
            if observation == observation_:
                reward -= 0.8
            if done:
                win_history.append(1)
            total_reward += reward
            if total_reward < -0.5 * 64:
                done = True
                win_history.append(0)
            self.RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                if episode > self.episode:
                    break
                else:
                    observation = self.reset()
                    dt = datetime.datetime.now() - start_time
                    t = self.format_time(dt.total_seconds())
                    total_reward_list.append(total_reward)
                    if len(total_reward_list) > 100:
                        avg_reward = sum(total_reward_list[-100:]) / 100
                        avg_reward_list.append(avg_reward)
                        template = "Episode: {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | Total rewards: {:.3f} | Average rewards: {:.3f} | time: {}"
                        print(
                            template.format(
                                episode, self.episode, stepCount,
                                sum(win_history) / len(win_history),
                                total_reward, avg_reward, t))
                    else:
                        template = "Episode: {:03d}/{:d} | StepCount: {:d} | Win rate: {:.3f} | Total rewards: {:.3f} | time: {}"
                        print(
                            template.format(
                                episode, self.episode, stepCount,
                                sum(win_history) / len(win_history),
                                total_reward, t))
                    episode += 1
                    stepCount = 0
                    total_reward = 0
                    visited = set()
                    done = False

        # end of training
        print('training over!')
        self.labelHello = Label(self.cv,
                                text="training end!",
                                font=("Helvetica", 10),
                                width=10,
                                fg="red",
                                bg="white")
        self.labelHello.place(x=200, y=750, anchor=NW)
        print("total_win_rate", sum(win_history) / len(win_history))
        print("total_time", t)
        print("average rewards per episode",
              sum(total_reward_list) / len(total_reward_list))
        self.learning = False
        self.reset()
        plt.figure()
        plt.title('Rewards per Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(total_reward_list)

        plt.show()

        plt.figure()
        plt.title('Average Rewards over 100 Episode')
        plt.xlabel('Episode number')
        plt.ylabel('Rewards')
        plt.plot(avg_reward_list)

        plt.show()
Пример #29
0
import numpy as np
import pandas as pd
from matplotlib import rcParams
# QLearningTable and BFramework are assumed to come from project-local modules not shown in this snippet
rcParams['font.family'] = 'serif'
rcParams['font.serif'] = ['Times New Roman']

###############################read data###################################
df = pd.read_csv('YearData.csv')
D = df['demand'].values
P = df['Price Data'].values

for BAR_POR in [1]:
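    # For each scaling factor BAR_POR (only 1 here), build BFramework and compare the strategies returned by general_performance_sys (our, rl, mpc, nos, thb, ofl) on the year of demand and price data.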
    B = 8820 * BAR_POR
    XIN = 0.7
    FUL = 1
    W = 2
    LEVEL = np.array([20, 2000, 2000])
    RL = QLearningTable(actions=list(range(3)))
    BF = BFramework(XIN, FUL, LEVEL, B, W)
    socour_o, socrl_o, socmpc_o, socnos_o, socthb_o, socofl_o = BF.general_performance_sys(
        D, P)

    socour_ori = socour_o[0, :, :]
    socrl_ori = socrl_o[0, :, :]
    socmpc_ori = socmpc_o[0, :, :]
    socnos_ori = socnos_o[0, :, :]
    socthb_ori = socthb_o[0, :, :]
    socofl_ori = socofl_o[0, :, :]

    socour = np.zeros((731, 2))
    socrl = np.zeros((731, 2))
    socmpc = np.zeros((731, 2))
    socnos = np.zeros((731, 2))
Пример #30
0
    print('update')
    for episode in range(100):
        # initialize observation
        observation = env.reset()

        while True:
            # fresh env
            env.render()
            action = RL.choose_action(str(observation))
            observation_, reward, done = env.step(action)

            RL.learn(str(observation), action, reward, str(observation_))

            observation = observation_

            if done:
                break
    print('game over')

    env.destroy()


if __name__ == '__main__':
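    # e_greedy=0.9 leaves 10% exploration during training; the initial Q-table is printed for inspection before the GUI loop starts.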
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)), e_greedy=0.9)
    print(RL.q_table)

    env.after(100, update)

    env.mainloop()