def main():
    env = RideHitch("data/norm1000.txt")
    print(env.requests_list)
    RL = DeepQNetwork(env.pool_size, env.state_num,
                      learning_rate=0.01,
                      reward_decay=0.99,
                      e_greedy=1,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=False,
                      T=env.T_threshold,
                      D=env.D_threshold)
    step = 0
    matched_list = []
    for episode in range(100):
        # init
        observation = env.reset(reset_seq=False)
        # if episode % 100 == 0:
        #     print(episode)
        matched = 0
        print("seq size:", env.request_num, "pool size:", env.pool_size)
        while True:
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            if reward > 0:
                # a positive reward means this request was matched
                matched += 1
            RL.store_transition(observation, action, reward, observation_)
            # learn every 5 steps once 200 transitions have been collected
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            observation = observation_
            if done:
                break
            step += 1
        matched_list.append(matched)
        print("eps", episode, "matching", matched)
        # print(matched_list)
    RL.plot_cost()
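# --- Minimal sketch of the agent interface the loops in this file rely on ---
# Every training loop here assumes a DeepQNetwork object exposing
# choose_action / store_transition / learn. This is NOT the project's
# DeepQNetwork: it substitutes a linear Q-function updated with numpy so the
# sketch stays self-contained (no target network, and constructor arguments
# such as T or D are omitted). Treat it as an illustration of the contract only.
import numpy as np

class MiniDQN:
    def __init__(self, n_actions, n_features, learning_rate=0.01,
                 reward_decay=0.9, e_greedy=0.9, memory_size=2000, batch_size=32):
        self.n_actions, self.n_features = n_actions, n_features
        self.lr, self.gamma, self.epsilon = learning_rate, reward_decay, e_greedy
        # each memory row stores [s, a, r, s_]
        self.memory = np.zeros((memory_size, n_features * 2 + 2))
        self.memory_size, self.batch_size = memory_size, batch_size
        self.memory_counter = 0
        self.W = np.zeros((n_features, n_actions))  # linear Q(s, a) = s @ W

    def choose_action(self, observation):
        # epsilon-greedy over the current Q estimates
        if np.random.uniform() < self.epsilon:
            return int(np.argmax(observation @ self.W))
        return np.random.randint(self.n_actions)

    def store_transition(self, s, a, r, s_):
        # overwrite the oldest slot once the ring buffer is full
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = np.hstack((s, [a, r], s_))
        self.memory_counter += 1

    def learn(self):
        # sample a batch and take one TD(0) step on the linear weights
        upper = min(self.memory_counter, self.memory_size)
        batch = self.memory[np.random.choice(upper, self.batch_size)]
        s = batch[:, :self.n_features]
        a = batch[:, self.n_features].astype(int)
        r = batch[:, self.n_features + 1]
        s_ = batch[:, -self.n_features:]
        q_next = (s_ @ self.W).max(axis=1)
        td_error = r + self.gamma * q_next - (s @ self.W)[np.arange(len(a)), a]
        for k in range(len(a)):
            self.W[:, a[k]] += self.lr * td_error[k] * s[k]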
value = 0
for step in range(STEP):
    state = env.state.copy()
    action = RL.choose_action(state)
    env.step(action_space[action])
    state_ = env.state.copy()
    reward, dist = compute_reward(state, state_)
    RL.store_transition(state, action, reward, state_)
    value += reward
    if dist < DIST:
        break
    if RL.memory_counter > MEMORYCAPACITY:
        RL.learn()
if (episode + 1) % 100 == 0:
    env.display2D()
    print(episode + 1)
    print(value)
    if (dist < DIST) and (dist > 0):
        print("Got Target")
    if dist < 0:
        print("Got Obstacle")
    if dist > DIST:
        print("Failed Target")
    print('*' * 40)
if (episode + 1) % 10000 == 0:
    RL.save_model()
if i >= 1:
    RL_farm.store_transition(observation_1_last, action1, reward_stage1_old, observation_1_new)
    RL_server[farm_id].store_transition(observation_2_last, action2, reward_stage2_old, observation_2_new)
observation_1_last = observation_1_new
observation_2_last = observation_2_new
reward_stage1_old = reward_stage1_new
reward_stage2_old = reward_stage2_new
RL_server_n[server_id] += 1
last_input_time = curr_task.input_time
if (i > 200) and (i % 5 == 0):
    RL_farm.learn()
if (RL_server_n[server_id] > 200) and (server_id % 5 == 0):
    RL_server[server_id].learn()
if drop:
    print("task " + str(i) + " drop")
else:
    print("task " + str(i) + ": farm " + str(farm_id) + " server: " + str(server_id))
print("----------------------")
if i == 999:
    print("total_cost")
    print(total_cost.total_price(it))
                   child2=child2, child3=child3)
kind = 3
DQN = DeepQNetwork(double_q=False, dueling_q=False, env=str(envs))
steps = 0
for i in range(30):
    start = time.time()
    support.create_csv(Env.save_title, kind=kind, i=i + 1)
    s = Env.reset()
    while not Env.done:
        action = DQN.choose_actions(s)
        s_, r, done, advise = Env.step(action)
        DQN.store_transition(s, action, r, s_)
        s = s_
        if steps > 100:
            DQN.learn()
            Env.loss = DQN.cost
        steps += 1
        Env.steps += 1
        if Env.steps != 0:
            support.save_data2csv(Env.save_data, kind=kind, i=i + 1)
        if Env.steps >= max_steps:
            break
    support.save_data2csv_end(kind=kind, i=i + 1)
    support.save_fig(kind=kind, i=i + 1)
    DQN.store_results(
        support.root + support.child1[0] + support.child2[kind] + support.child3[1] + str(i + 1) + "th/" + "models",
        i + 1)
    DQN.store_graph(support.root + support.child1[0] + support.child2[kind] + support.child3[2] + str(i + 1) + "th/")
    print(
class LTDQN(Approach):
    def __init__(self, budget, times, users, n_scope, r_interval=0.01, isTrain=True):
        Approach.__init__(self, budget, times, users)
        self.n_scope = n_scope
        self.state_dim = 8
        self.action_dim = 9
        self.r_interval = r_interval
        if isTrain:
            self.dqn = DeepQNetwork(self.action_dim, self.state_dim)
        else:
            self.dqn = DeepQNetwork(self.action_dim, self.state_dim, e_greedy_increment=None)

    def generate_reward(self, action, user):
        if action == 1:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
        elif action == 2:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
        elif action == 3:
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 4:
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 5:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 6:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 7:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 8:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1

    def simulate(self):
        self.dqn.load()
        for ep in range(1):
            # self.users = self.init_users_list()
            total_benefits = 0.
            total_expense = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            # if user.default_single_r >= 0.5:
                            user.receive_offer(user.default_single_r, user.default_num, output)
                            # else:
                            #     user.receive_offer(0, user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)
                        total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()
                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                    # self.dqn.store_transition(user.state, action, reward, user.state_)
                    user.state = user.state_.copy()
                    # self.dqn.learn()
                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                        total_expense += benefits / (1. - benefits + 0.001)
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num / len(self.users))
                    self.ratio.append(total_expense)
                print("\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f" %
                      (ep, time, self.budget, total_benefits), end=' ')
            print()

    def init_users_list(self):
        user_list = []
        arr = np.loadtxt('../dataset/test.txt', delimiter=' ')
        # arr = arr[0:2000, :]  # train
        # print(arr)
        total_cost = 0.
        for row in range(arr.shape[0]):
            data = arr[row, :]
            # print(data[0])
            user = User(row, float(data[0]), data[1:])
            total_cost += user.preference[np.argmax(user.preference)] - user.preference[-1]
            user_list.append(user)
        print(len(user_list), total_cost)
        return user_list

    def train(self):
        for ep in range(500):
            self.budget = 50000
            self.users = self.init_users_list()
            # self.users = self.init_users_list()
            self.dqn.epsilon = 0
            total_benefits = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            user.receive_offer(user.default_single_r, user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)
                        total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()
                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                    # print(user.state_, user.state)
                    self.dqn.store_transition(user.state, action, reward, user.state_)
                    user.state = user.state_.copy()
                    self.dqn.learn()
                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num / len(self.users))
                print("\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f" %
                      (ep, time, self.budget, total_benefits), end=' ')
                if self.budget <= 0:
                    break
            print()
        self.dqn.save()
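# Hedged usage sketch for the LTDQN class above. The budget of 50000 and the
# dataset path mirror values visible in train()/init_users_list(); the times
# and n_scope arguments below are placeholders, and the exact behaviour of the
# Approach base class (e.g. self.interval) is assumed from the surrounding code.
if __name__ == '__main__':
    approach = LTDQN(budget=50000, times=200, users=[], n_scope=5, isTrain=True)
    approach.train()                                # trains and saves the DQN
    approach.users = approach.init_users_list()     # fresh user list for evaluation
    approach.simulate()                             # reloads the model, one evaluation pass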
class view(tkinter.Tk):
    def __init__(self):
        self.gameStart = False
        self.status = False
        self.reward = 0
        super(view, self).__init__()
        self.n_actions = 361   # number of possible actions (19x19 board positions)
        self.n_features = 361
        self.doneList = []
        self.allphoto = []
        self.initView()
        self.env = env()
        self.wobservation = None
        self.wobservation_ = None
        self.action1 = None
        self.RL = DeepQNetwork(self.n_actions, self.n_features)

    def callback(self, event):
        if self.gameStart:
            mouse_x = event.x
            mouse_y = event.y
            if 590 > mouse_x > 20 and 590 > mouse_y > 20:
                # a is the column index, b the row index
                a = round((mouse_x - 40) / 30)
                b = round((mouse_y - 40) / 30)
                action = b * 19 + a
                # self.env.qipan[b, a] = 2, the human (non-computer) side
                observation = self.getdouble(np.reshape(np.copy(self.env.qipan), [1, space]))
                bobservation = self.transfore(observation)
                qipan, observation_, reward, done = self.step(action, 'Black')
                bobservation_ = self.transfore(observation_)
                print('reward of the human move: %d' % reward)
                # assume the human move is near-optimal, so weight its reward higher
                self.RL.store_transition(bobservation, action, reward * 1.5, bobservation_)
                if done:
                    tkinter.messagebox.showinfo(title='Notice', message='you win!!!1')
                    self.RL.learn(flag=2)
                    self.RL.saveavarriable()
                    self.RL.plot_cost()
                    self.gameStart = False
                    # self.status = True
                # let the computer choose its next move
                self.bqipan = np.copy(self.env.qipan)
                wobservation = self.getdouble(np.reshape(self.bqipan, [1, space]))
                action1 = self.RL.choose_action(self.bqipan, wobservation)
                bqipan_, wobservation_, reward, done = self.step(action1, 'White')
                print('reward of the computer move: %d' % reward)
                self.RL.store_transition(observation, action, reward, observation_)
                if done:
                    tkinter.messagebox.showinfo(title='Notice', message='you failure')
                    self.RL.saveavarriable()
                    self.RL.plot_cost()
                    self.gameStart = False

    def initView(self):
        def buttonCallBack():
            self.RL.getvarriable()
            self.gameStart = True
            if len(self.allphoto) > 0:
                for i in self.allphoto:
                    self.w.delete(i)
                self.allphoto.clear()
            self.doneList.clear()
            observation = self.env.reset()

        self.master = Tk()
        self.master.title("Gomoku")
        self.master.resizable(width=False, height=False)
        self.w = Canvas(self.master, bg="#FFFFF0", width=700, height=630)
        for c in range(40, 610, 30):   # vertical grid lines
            x0, y0, x1, y1 = c, 40, c, 580
            self.w.create_line(x0, y0, x1, y1)
        for r in range(40, 610, 30):   # horizontal grid lines
            x0, y0, x1, y1 = 40, r, 580, r
            self.w.create_line(x0, y0, x1, y1)
        Label(self.w, text=1, bg="#FFFFF0").place(x=5, y=5)
        x1 = 60
        y1 = 5
        for i in range(2, 20):
            Label(self.w, text=i, bg="#FFFFF0").place(x=x1, y=y1)
            x1 += 30
        x1 = 5
        y1 = 60
        for i in range(2, 20):
            Label(self.w, text=i, bg="#FFFFF0").place(x=x1, y=y1)
            y1 += 30
        Button(self.w, text="Start Game", bg="yellow", activebackground="Black",
               command=buttonCallBack).place(x=610, y=500)
        self.w.bind("<Double-Button-1>", self.callback)
        self.w.pack()
        # self.master.mainloop()

    def show(self, action, flag):
        y = (action // 19) * 30 + 40
        x = (action % 19) * 30 + 40
        if flag == 'Black':
            a = self.w.create_oval(x - 14, y - 14, x + 14, y + 14, fill="Black")
        elif flag == 'White':
            a = self.w.create_oval(x - 14, y - 14, x + 14, y + 14, fill="White")
        self.allphoto.append(a)
        self.update()

    def setPosition(self, action, flag):
        if action in self.doneList:
            tkinter.messagebox.showinfo(title='Notice', message='This position is not available')
        else:
            self.doneList.append(action)
            self.show(action, flag)

    def reset(self):
        if len(self.allphoto) > 0:
            for i in self.allphoto:
                self.w.delete(i)
            self.allphoto.clear()
        self.doneList.clear()
        self.gameStart = False
        observation = self.env.reset()
        ob = self.getdouble(np.reshape(observation, [1, space]))
        return np.copy(self.env.qipan), ob

    #############################################
    def step(self, action, flag):
        # return a reward that depends on which side placed the stone
        # print(flag)
        # print('action: %d' % action)
        p1 = self.env.pwb(flag)
        p2 = self.env.pwn(action, flag)
        # winning potential after the move is made
        # print('score before the move: %d' % p1)
        # print('score after the move: %d' % p2)
        s = p2 - p1
        # if s <= 0:
        #     self.reward = 0
        # elif 0 < s < 150:
        #     self.reward = 300
        # elif 150 <= s < 800:
        #     self.reward = 500
        # elif 800 <= s < 3500:
        #     self.reward = 2000
        # elif 3500 <= s < 4800:
        #     self.reward = 4000
        # elif s > 4800:
        #     self.reward = 6000
        print("reward of this move: %d" % s)
        self.setPosition(action, flag)
        if s == -120:
            time.sleep(10000)
        qipan = self.getdouble(np.reshape(np.copy(self.env.qipan), [1, space]))
        return np.copy(self.env.qipan), qipan, s, self.env.done

    def tryPosition(self, Ob, ation, flag):
        qipan = np.copy(Ob)
        if flag == 'White':
            qipan[0, ation] = 1
        else:
            qipan[0, ation] = 2
        return qipan

    def render(self):
        self.update()

    def transfore(self, observation):
        # print(np.shape(shape)[1])
        s1 = observation[0, :space]
        s2 = observation[0, space:]
        s = np.hstack((s1, s2))
        return s  # convert the 1*361 board into the 1*722 form

    def getdouble(self, qipan):
        w_qipan = np.zeros([1, space])
        b_qipan = np.zeros([1, space])
        w_array = np.where(qipan == 1)[1]
        b_array = np.where(qipan == 2)[1]
        w_qipan[0, w_array] = 1
        b_qipan[0, b_array] = 1
        # 1*722 matrix: the first 361 entries are the white-stone states, the last 361 the black-stone states
        s = np.hstack((w_qipan, b_qipan))
        return s
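# Small self-contained illustration of the two-plane encoding produced by
# getdouble() above: a 19x19 board stored as a 1x361 row (1 = white, 2 = black)
# becomes a 1x722 vector whose first 361 entries mark white stones and whose
# last 361 mark black stones. The board values below are made-up sample data.
import numpy as np

space = 361                      # 19 * 19 intersections
qipan = np.zeros([1, space])     # flattened board
qipan[0, 0] = 1                  # a white stone at intersection 0
qipan[0, 180] = 2                # a black stone at the centre intersection

w_plane = (qipan == 1).astype(float)   # white-stone indicator plane
b_plane = (qipan == 2).astype(float)   # black-stone indicator plane
s = np.hstack((w_plane, b_plane))      # shape (1, 722), as returned by getdouble()
assert s.shape == (1, 2 * space) and s[0, 0] == 1 and s[0, space + 180] == 1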
class Trainer(object):
    def __init__(self):
        start_table = dict()
        end_table = dict()
        self.RL = DeepQNetwork(n_actions, n_features,
                               learning_rate=0.01,
                               reward_decay=0.9,
                               e_greedy=0.9,
                               replace_target_iter=200,
                               memory_size=2000,
                               output_graph=False,
                               testing=False)
        filename = "test_destinations.txt"
        f = open(filename, "r")
        for line in f:
            # each line holds "x1,y1;x2,y2" for a start/end pair
            nums = line.split(';')
            start_ = nums[0].split(',')
            end_ = nums[1].split(',')
            start = [0, 0]
            end = [0, 0]
            start[0] = int(start_[0])
            start[1] = int(start_[1])
            end[0] = int(end_[0])
            end[1] = int(end_[1])
            start_table[start[0]] = start[1]
            end_table[end[0]] = end[1]
        f.close()

        # Training time keeping
        total_time = 0
        start = time.time()
        # train on 150 samples
        self.run_training(150, start_table, end_table)
        # Training time keeping
        total_time = (time.time() - start) / 60
        # write the minutes taken to train to file
        time_file = "trainTime.txt"
        f = open(time_file, "w+")
        f.write(str(total_time))
        f.close()

    def run_training(self, training_samples, start_table, end_table):
        # Train over multiple instances
        map_file = np.loadtxt('map.txt', dtype=int)
        # bounding negative values for keeping it in bounds
        map_file[0, :] = MIN_VALUE
        map_file[:, 0] = MIN_VALUE
        map_file[:, len(map_file) - 1] = MIN_VALUE
        map_file[len(map_file) - 1, :] = MIN_VALUE
        for sample_x in range(training_samples):
            start = [random.randint(1, IMG_SIZE - 1), random.randint(1, IMG_SIZE - 1)]
            end = [random.randint(1, IMG_SIZE - 1), random.randint(1, IMG_SIZE - 1)]
            # query dictionary
            start_ = start_table.get(start[0], -1)
            end_ = end_table.get(end[0], -1)
            # ensure different than test cases
            while start_ == start[1] and end_ == end[1]:
                start = [random.randint(1, IMG_SIZE - 1), random.randint(1, IMG_SIZE - 1)]
                end = [random.randint(1, IMG_SIZE - 1), random.randint(1, IMG_SIZE - 1)]
                start_ = start_table.get(start[0], -1)
                end_ = end_table.get(end[0], -1)
            total_epochs = 300
            # UAV map emulation
            env = Map(start, end, sample_x, map_file, False)
            self.run_map(str(sample_x), env, total_epochs)
            print("Finished training", sample_x)
        print("done training")
        # Save model here

    def run_map(self, i, env, epochs):
        step = 0
        s = []
        for episode in range(epochs):
            print("starting epoch ", episode)
            # initial observation
            observation = env.reset(str(episode))
            count = 0
            while True:
                count += 1
                # RL choose action based on observation
                action = self.RL.choose_action(observation)
                # RL take action and get next observation and reward
                observation_, reward, done = env.step(action)
                self.RL.store_transition(observation, action, reward, observation_)
                if ((step > 200) and (step % 5 == 0)) or done:
                    self.RL.learn(done)
                # swap observation
                observation = observation_
                # break while loop when end of this episode
                if done:
                    break
                step += 1
            s.append(count)
        plt.plot(np.arange(len(s)), s)
        plt.ylabel('points to goal')
        plt.xlabel('training steps')
        folder = "../DQN_path/graphs/"
        figname = folder + i + "_figPtsv1.png"
        plt.savefig(figname)
        plt.clf()
            HE_soc = []
            HP_soc = []
        else:
            print(time)
            Env_battery_update = Env_battery(final_data[0:time], HE_power_vector,
                                             HP_power_vector, HEcur, HPcur, speed[0:time])
            # RL = DeepQNetwork(Env_battery_update.n_actions, Env_battery_update.n_states,
            #                   learning_rate=0.01, reward_decay=0.9, e_greedy=0.9,
            #                   replace_target_iter=200, memory_size=2000)
            action = RL.choose_action(observation)
            # RL take action and get next observation and reward
            observation_, reward, HE_power_vector, HP_power_vector, HEcur, HPcur = \
                Env_battery_update.step(action, time - 1)  # time stamp of action
            print("reward", reward)
            reward_total = reward_total + reward
            RL.store_transition(observation, action, reward, observation_)
            cost = RL.learn()
            cost_total = cost_total + cost
            # swap observation
            observation = observation_
            HE_soc.append(observation[0])
            HP_soc.append(observation[1])
            # break while loop when end of this episode
    # Env_battery.mainloop()
    # RL.plot_cost()
    cost_final.append(cost_total)
    reward_final.append(reward_total)

plt.plot(np.arange(len(cost_final)), cost_final)
plt.ylabel('Cost')
plt.xlabel('Epoch ')
plt.show()