def DQN():
    import tensorflow as tf
    from DQN import DeepQNetwork
    import numpy as np

    game.restart_game()

    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    dqn = DeepQNetwork(sess, game)

    game_state = game.current_state()

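    # stack four copies of the current frame along the channel axis to build the initial state s_t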
    start_state = np.concatenate(
        (game_state, game_state, game_state, game_state), axis=2)
    s_t = start_state

    while not game.game_end():
        # choose an action epsilon greedily
        _, action_index = dqn.choose_action(s_t)

        move = action_index
        game.do_move(move)

        pygame.event.pump()

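        # slide the frame stack: prepend the newest frame and drop the two oldest channels so the stack depth stays fixed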
        game_state = game.current_state()
        s_t = np.append(game_state, s_t[:, :, :-2], axis=2)

        screen.fill(black)

        game.snake.blit(rect_len, screen)
        game.strawberry.blit(screen)
        game.blit_score(white, screen)

        pygame.display.flip()

        fpsClock.tick(15)

    crash()
Example #2
def main():
    env = RideHitch("data/norm1000.txt")
    print(env.requests_list)
    RL = DeepQNetwork(env.pool_size,
                      env.state_num,
                      learning_rate=0.01,
                      reward_decay=0.99,
                      e_greedy=1,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=False,
                      T=env.T_threshold,
                      D=env.D_threshold)
    step = 0
    matched_list = []
    for episode in range(100):
        # init
        observation = env.reset(reset_seq=False)
        # if episode % 100 == 0:
        #     print(episode)
        matched = 0
        print("seq size:", env.request_num, "pool size:", env.pool_size)
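        # run one episode: step through the request sequence, counting successful matches (positive reward)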
        while True:
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            if reward > 0:
                matched += 1
            RL.store_transition(observation, action, reward, observation_)
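            # learn every 5 steps once enough transitions (200 steps) have been collected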
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            observation = observation_
            if done:
                break
            step += 1
        matched_list.append(matched)
        print("eps", episode, "matching", matched)
    # print(matched_list)
    RL.plot_cost()
Example #3
                      learning_rate=0.0001,
                      reward_decay=0.9,
                      e_greedy=0.75,
                      replace_target_iter=2000,
                      memory_size=MEMORYCAPACITY,
                      batch_size=64
                      # output_graph=True
                      )
    RL.restore_model()
    for episode in range(EPS):
        env.build_map()
        value = 0

        for step in range(STEP):
            state = env.state.copy()
            action = RL.choose_action(state)
            env.step(action_space[action])
            state_ = env.state.copy()
            reward, dist = compute_reward(state, state_)

            RL.store_transition(state, action, reward, state_)
            value += reward
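            # stop the episode once dist (presumably the distance to the goal) drops below DIST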
            if dist < DIST:
                break

            if RL.memory_counter > MEMORYCAPACITY:
                RL.learn()

        if (episode + 1) % 100 == 0:
            env.display2D()
            print(episode + 1)
Example #4
    for it in range(250):
        for i in range(1000):
            #read data
            curr_task = read_task(i, df_task_usage)
            env.update_env(last_input_time, curr_task.input_time)

            # if str(job_ID) not in jobs:
            #     curr_job = env.Job(job_ID)
            #     jobs[str(job_ID)] = cur_job

            #need a hashmap to match the id to the job
            #curr_job.tasks.append(curr_task)
            observation_1_new = get_observation_1(curr_task, env)

            #stage_1
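            # the farm-level DQN picks a server farm; step_farm returns its id and the task's waiting time there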
            action1 = RL_farm.choose_action(observation_1_new)
            farm_id, waiting_time = env.step_farm(action1)
            curr_server_farm = env.serverfarms[farm_id]

            #stage_2
            observation_2_new = get_observation_2(curr_task, curr_server_farm)
            action2 = RL_server[farm_id].choose_action(observation_2_new)
            server_id = int(action2)
            curr_server = curr_server_farm.servers[server_id]

            #check hard_deadline & cpu,memory
            curr_task.start_time = curr_task.input_time + waiting_time
            curr_task.end_time = curr_task.start_time + curr_task.execution_time

            print("----------------------")
            print("start_time : " + str(curr_task.start_time))
Example #5



best_reward=0
best_pp=None
reward_list=[]
for episode in range(nEpisodes):
    # DQN
    observation, info = env.reset()
    # frame=info["frame"]
    ep_r = 0
    while True:
        env.render()

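        # action_transform (defined elsewhere in this snippet's source) presumably encodes the raw observation into the network's input format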
        action = RL.choose_action(action_transform(observation))

        observation_, reward, done, info = env.step(action)

        
        RL.store_transition(action_transform(observation), action, reward, action_transform(observation_))

        ep_r += reward
        if total_steps > 1000:
            RL.learn()

        if done:
            print('episode: ', episode,
                  'ep_r: ', round(ep_r, 2),
                  ' epsilon: ', round(RL.epsilon, 2))
            break
Example #6
                  replace_target_iter=100,
                  memory_size=2000,
                  e_greedy_increment=.001,
                  )

total_steps = 0

for episode in range(1000):
    observation = env.reset()

    ep_r = 0

    while True:
        env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
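        # reward shaping: favour keeping the cart near the centre and the pole upright instead of the default +1 reward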
        x, x_dot, theta, theta_dot = observation_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - .8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - .5
        reward = r1 + r2

        RL.store_transition(observation, action, reward, observation_)

        ep_r += reward

        if total_steps > 1000:
            RL.learn()

        if done:
            print("episode: ", episode, "ep_r",round(ep_r, 2),
Example #7
class LTDQN(Approach):
    def __init__(self,
                 budget,
                 times,
                 users,
                 n_scope,
                 r_interval=0.01,
                 isTrain=True):
        Approach.__init__(self, budget, times, users)
        self.n_scope = n_scope
        self.state_dim = 8
        self.action_dim = 9
        self.r_interval = r_interval
        if isTrain:
            self.dqn = DeepQNetwork(self.action_dim, self.state_dim)
        else:
            self.dqn = DeepQNetwork(self.action_dim,
                                    self.state_dim,
                                    e_greedy_increment=None)

    def generate_reward(self, action, user):
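        # actions 1-8 nudge the user's per-unit reward (default_single_r) and/or request count (default_num), clipped to valid ranges; any other action leaves them unchanged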
        if action == 1:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
        elif action == 2:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
        elif action == 3:
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 4:
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 5:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 6:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 7:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 8:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1

    def simulate(self):
        self.dqn.load()
        for ep in range(1):
            # self.users = self.init_users_list()
            total_benefits = 0.
            total_expense = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
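                    # while budget remains, let the loaded DQN tailor this user's offer; otherwise send an empty offer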
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            # if user.default_single_r >= 0.5:
                            user.receive_offer(user.default_single_r,
                                               user.default_num, output)
                            # else:
                            #     user.receive_offer(0, user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)

                    total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()

                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                        # self.dqn.store_transition(user.state, action, reward, user.state_)
                        user.state = user.state_.copy()
                        # self.dqn.learn()
                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                    total_expense += benefits / (1. - benefits + 0.001)
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num /
                                                len(self.users))
                    self.ratio.append(total_expense)
                print(
                    "\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f" %
                    (ep, time, self.budget, total_benefits),
                    end=' ')
            print()

    def init_users_list(self):
        user_list = []
        arr = np.loadtxt('../dataset/test.txt', delimiter=' ')
        # arr = arr[0:2000, :]  # train
        # print(arr)
        total_cost = 0.
        for row in range(arr.shape[0]):
            data = arr[row, :]
            # print(data[0])
            user = User(row, float(data[0]), data[1:])
            total_cost += user.preference[np.argmax(
                user.preference)] - user.preference[-1]
            user_list.append(user)
        print(len(user_list), total_cost)
        return user_list

    def train(self):
        for ep in range(500):
            self.budget = 50000
            self.users = self.init_users_list()
            # self.users = self.init_users_list()
            self.dqn.epsilon = 0
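            # reset epsilon so each episode starts out exploring (assuming an epsilon that grows toward e_greedy via e_greedy_increment)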
            total_benefits = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            user.receive_offer(user.default_single_r,
                                               user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)

                    total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()

                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                        # print(user.state_, user.state)
                        self.dqn.store_transition(user.state, action, reward,
                                                  user.state_)
                        user.state = user.state_.copy()
                        self.dqn.learn()

                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num /
                                                len(self.users))
                print(
                    "\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f" %
                    (ep, time, self.budget, total_benefits),
                    end=' ')
                if self.budget <= 0:
                    break
            print()
            self.dqn.save()
Example #8
class view(tkinter.Tk):
    def __init__(self):
        self.gameStart=False
        self.status=False
        self.reward=0
        super(view, self).__init__()
        self.n_actions = 361    # number of possible actions (the 19x19 board has 361 points)
        self.n_features = 361
        self.doneList=[]
        self.allphoto=[]
        self.initView()
        self.env=env()
        self.wobservation=None
        self.wobservation_=None
        self.action1=None
        self.RL = DeepQNetwork(self.n_actions, self.n_features )

    def callback(self,event):
        if self.gameStart:
            mouse_x = event.x
            mouse_y = event.y
            if 590 > mouse_x > 20 and 590 > mouse_y > 20:
                # a is the horizontal (column) index, b the vertical (row) index
                a = round((mouse_x - 40) / 30)
                b = round((mouse_y - 40) / 30)
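                # flatten (row b, column a) into a single action index on the 19x19 board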
                action = b * 19 + a
                # self.env.qipan[b, a] = 2 marks the non-computer (human) side
                observation =self.getdouble(np.reshape(np.copy(self.env.qipan), [1, space]))
                bobservation=self.transfore(observation)
                qipan,observation_, reward, done=self.step(action, 'Black')
                bobservation_=self.transfore(observation_)
                print('reward of the human move: %d' % reward)
                self.RL.store_transition(bobservation, action, reward*1.5, bobservation_)  # assume the human's move is optimal, so weight its reward by 1.5
                if done:
                    tkinter.messagebox.showinfo(title='Notice', message='You win!')
                    self.RL.learn(flag=2)
                    self.RL.saveavarriable()
                    self.RL.plot_cost()
                    self.gameStart=False
                # self.status = True
                # the computer chooses its move
                self.bqipan=np.copy(self.env.qipan)
                wobservation = self.getdouble(np.reshape(self.bqipan,[1,space]))
                action1 = self.RL.choose_action(self.bqipan, wobservation)     # let the computer choose its next move
                bqipan_, wobservation_, reward, done = self.step(action1, 'White')
                print('reward of the computer move: %d' % reward)
                self.RL.store_transition(self.transfore(wobservation), action1, reward, self.transfore(wobservation_))
                if done:
                    tkinter.messagebox.showinfo(title='Notice', message='You lose')
                    self.RL.saveavarriable()
                    self.RL.plot_cost()
                    self.gameStart = False

    def initView(self):
        def buttonCallBack():
            self.RL.getvarriable()
            self.gameStart = True
            if len(self.allphoto) > 0:

                for i in self.allphoto:
                    self.w.delete(i)

            self.allphoto.clear()
            self.doneList.clear()
            observation = self.env.reset()

        self.master = Tk()
        self.master.title("Gomoku")
        self.master.resizable(width=False, height=False)
        self.w = Canvas(self.master, bg="#FFFFF0", width=700, height=630)
        for c in range(40, 610, 30):  # vertical grid lines
            x0, y0, x1, y1 = c, 40, c, 580
            self.w.create_line(x0, y0, x1, y1)
        for r in range(40, 610, 30):
            x0, y0, x1, y1 = 40, r, 580, r
            self.w.create_line(x0, y0, x1, y1)
        Label(self.w, text=1, bg="#FFFFF0").place(x=5, y=5)
        x1 = 60
        y1 = 5
        for i in range(2, 20):
            Label(self.w, text=i, bg="#FFFFF0").place(x=x1, y=y1)
            x1 += 30
        x1 = 5
        y1 = 60
        for i in range(2, 20):
            Label(self.w, text=i, bg="#FFFFF0").place(x=x1, y=y1)
            y1 += 30
        Button(self.w, text="Start Game", bg="yellow", activebackground="Black", command=buttonCallBack).place(x=610, y=500)
        self.w.bind("<Double-Button-1>", self.callback)
        self.w.pack()
        #self.master.mainloop()


    def show(self,action,flag):
        y=(action//19)*30+40
        x=(action%19)*30+40
        if flag=='Black':
            a=self.w.create_oval(x-14,y-14,x+14,y+14,fill="Black")
        elif flag=='White':
            a = self.w.create_oval(x-14, y-14, x+14, y+14, fill="White")
        self.allphoto.append(a)
        self.update()

    def setPosition(self,action,flag):
        if action in self.doneList:
            tkinter.messagebox.showinfo(title='Notice', message='This position cannot be played')

        else:
            self.doneList.append(action)
            self.show(action,flag)

    def reset(self):
        if len(self.allphoto)>0:

            for i in self.allphoto:
                self.w.delete(i)
        self.allphoto.clear()
        self.doneList.clear()
        self.gameStart=False
        observation=self.env.reset()
        ob=self.getdouble(np.reshape(observation,[1,space]))
        return np.copy(self.env.qipan),ob


    #############################################
    def step(self,action,flag):
        # return the reward depending on which side just placed a stone
        # print(flag)
        # print('ation:%d'%action)
        p1 = self.env.pwb(flag)
        p2 = self.env.pwn(action, flag)  # winning potential after making this move

        # print('score before the move: %d' % p1)
        # print('score after the move: %d' % p2)
        s=p2-p1
        # if s<=0:
        #     self.reward=0
        # elif 0<s<150:
        #     self.reward=300
        # elif 150<=s<800:
        #     self.reward=500
        # elif 800<=s<3500:
        #     self.reward=2000
        # elif 3500<=s<4800:
        #     self.reward=4000
        # elif s>4800:
        #     self.reward=6000

        print("reward for this move: %d" % s)

        self.setPosition(action,flag)
        if(s==-120):
            time.sleep(10000)
        qipan=self.getdouble(np.reshape(np.copy(self.env.qipan),[1,space]))
        return np.copy(self.env.qipan),qipan,s,self.env.done


    def tryPosition(self,Ob,ation,flag):
         qipan=np.copy(Ob)
         if flag=='White':
             qipan[0,ation]=1
         else:
             qipan[0,ation]=2
         return qipan


    def render(self):
        self.update()

    def transfore(self,observation):
        # print(np.shape(shape)[1])
        s1=observation[0,:space]
        s2=observation[0,space:]
        s=np.hstack((s1,s2))
        return s

    # convert the 1x361 board into the 1x722 two-plane form
    def getdouble(self,qipan):
        w_qipan=np.zeros([1,space])
        b_qipan=np.zeros([1,space])
        w_array=np.where(qipan==1)[1]
        b_array=np.where(qipan==2)[1]
        w_qipan[0,w_array]=1
        b_qipan[0,b_array]=1
        s = np.hstack((w_qipan, b_qipan))  # 1x722 matrix: the first 361 entries encode the white stones, the last 361 the black stones
        return s
Example #9
class Trainer(object):
    def __init__(self):
        start_table = dict()
        end_table = dict()
        self.RL = DeepQNetwork(n_actions,
                               n_features,
                               learning_rate=0.01,
                               reward_decay=0.9,
                               e_greedy=0.9,
                               replace_target_iter=200,
                               memory_size=2000,
                               output_graph=False,
                               testing=False)

        filename = "test_destinations.txt"
        f = open(filename, "r")

        for line in f:
            nums = line.split(';')
            start_ = nums[0].split(',')
            end_ = nums[1].split(',')

            start = [0, 0]
            end = [0, 0]
            start[0] = int(start_[0])
            start[1] = int(start_[1])
            end[0] = int(end_[0])
            end[1] = int(end_[1])

            start_table[start[0]] = start[1]
            end_table[end[0]] = end[1]
        f.close()

        # Training Time keeping
        total_time = 0
        start = time.time()

        # train on 150 samples
        self.run_training(150, start_table, end_table)

        # Training Time keeping
        total_time = (time.time() -
                      start) / 60  # minutes taken to train
        time_file = "trainTime.txt"
        f = open(time_file, "w+")
        f.write(str(total_time))
        f.close()

    def run_training(self, training_samples, start_table, end_table):
        # Train over multiple instances
        map_file = np.loadtxt('map.txt', dtype=int)
        # bounding negative values for keeping it in bounds
        map_file[0, :] = MIN_VALUE
        map_file[:, 0] = MIN_VALUE
        map_file[:, len(map_file) - 1] = MIN_VALUE
        map_file[len(map_file) - 1, :] = MIN_VALUE

        for sample_x in range(training_samples):
            start = [
                random.randint(1, IMG_SIZE - 1),
                random.randint(1, IMG_SIZE - 1)
            ]
            end = [
                random.randint(1, IMG_SIZE - 1),
                random.randint(1, IMG_SIZE - 1)
            ]

            # query dictionary
            start_ = start_table.get(start[0], -1)
            end_ = end_table.get(end[0], -1)

            # ensure different than test cases
            while (start_ == start[1] and end_ == end[1]):
                start = [
                    random.randint(1, IMG_SIZE - 1),
                    random.randint(1, IMG_SIZE - 1)
                ]
                end = [
                    random.randint(1, IMG_SIZE - 1),
                    random.randint(1, IMG_SIZE - 1)
                ]
                start_ = start_table.get(start[0], -1)
                end_ = end_table.get(end[0], -1)

            total_epochs = 300

            # UAV map emulation
            env = Map(start, end, sample_x, map_file, False)
            self.run_map(str(sample_x), env, total_epochs)

            print("Finished training", sample_x)
        print("done training")

        # Save model here

    def run_map(self, i, env, epochs):
        step = 0
        s = []
        for episode in range(epochs):
            print("starting epoch ", episode)
            # initial observation
            observation = env.reset(str(episode))
            count = 0
            while True:
                count += 1
                # RL choose action based on observation
                action = self.RL.choose_action(observation)

                # RL take action and get next observation and reward
                observation_, reward, done = env.step(action)

                self.RL.store_transition(observation, action, reward,
                                         observation_)

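                # learn every 5 steps after a 200-step warm-up, and also whenever the episode ends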
                if ((step > 200) and (step % 5 == 0)) or done:
                    self.RL.learn(done)

                # swap observation
                observation = observation_

                # break while loop when end of this episode
                if done:
                    break
                step += 1
            s.append(count)

        plt.plot(np.arange(len(s)), s)
        plt.ylabel('points to goal')
        plt.xlabel('training steps')

        folder = "../DQN_path/graphs/"
        figname = folder + i + "_figPtsv1.png"
        plt.savefig(figname)
        plt.clf()