def run_testcase(filename):
    # find destinations in folder starting_... and ending_...
    finder = findDestinations(filename)
    end = finder.returnDestination()
    start = finder.returnStarting()
    map_file = np.loadtxt('map.txt', dtype=int)
    # bounding negative values for keeping it in bounds
    map_file[0, :] = MIN_VALUE
    map_file[:, 0] = MIN_VALUE
    map_file[:, len(map_file) - 1] = MIN_VALUE
    map_file[len(map_file) - 1, :] = MIN_VALUE
    # UAV map emulation
    env = Map(start, end, filename, map_file)
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=True,
                      iteration=filename)
    run_map(filename, RL, env)
    RL.plot_cost()
    compare = Compare(filename)  # compare to given results
    print("Finished iteration", filename)
def __init__(self):
    self.gameStart = False
    self.status = False
    self.reward = 0
    super(view, self).__init__()
    self.n_actions = 361  # number of possible actions
    self.n_features = 361
    self.doneList = []
    self.allphoto = []
    self.initView()
    self.env = env()
    self.wobservation = None
    self.wobservation_ = None
    self.action1 = None
    self.RL = DeepQNetwork(self.n_actions, self.n_features)
def train(**kwargs):
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)
    RL = DeepQNetwork(env.n_actions, env.n_features, opt)
    # pass the callable and its argument separately; calling
    # update_dqn(RL) inline would run it immediately instead of after 100 ms
    env.after(100, update_dqn, RL)
    env.mainloop()
def __init__(self):
    start_table = dict()
    end_table = dict()
    self.RL = DeepQNetwork(n_actions, n_features,
                           learning_rate=0.01,
                           reward_decay=0.9,
                           e_greedy=0.9,
                           replace_target_iter=200,
                           memory_size=2000,
                           output_graph=False,
                           testing=False)
    filename = "test_destinations.txt"
    with open(filename, "r") as f:
        for line in f:
            nums = line.split(';')
            start_ = nums[0].split(',')
            end_ = nums[1].split(',')
            # convert the parsed coordinate strings to ints;
            # overwriting start with [0, 0] before reading it discarded the parsed values
            start = [int(start_[0]), int(start_[1])]
            end = [int(end_[0]), int(end_[1])]
            start_table[start[0]] = start[1]
            end_table[end[0]] = end[1]
    # Training Time keeping
    total_time = 0
    start = time.time()
    # train on the loaded samples
    self.run_training(150, start_table, end_table)
    # minutes taken to train
    total_time = (time.time() - start) / 60
    time_file = "trainTime.txt"
    with open(time_file, "w+") as f:
        f.write(str(total_time))
def DQN():
    import tensorflow as tf
    from DQN import DeepQNetwork
    import numpy as np

    game.restart_game()
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    dqn = DeepQNetwork(sess, game)
    game_state = game.current_state()
    start_state = np.concatenate(
        (game_state, game_state, game_state, game_state), axis=2)
    s_t = start_state
    while not game.game_end():
        # choose an action epsilon greedily
        _, action_index = dqn.choose_action(s_t)
        move = action_index
        game.do_move(move)
        pygame.event.pump()
        game_state = game.current_state()
        s_t = np.append(game_state, s_t[:, :, :-2], axis=2)
        screen.fill(black)
        game.snake.blit(rect_len, screen)
        game.strawberry.blit(screen)
        game.blit_score(white, screen)
        pygame.display.flip()
        fpsClock.tick(15)
    crash()
def __init__(self, input_dims, n_actions, lr, mem_size, batch_size,
             epsilon, gamma=0.99, eps_dec=5e-7, eps_min=0.01,
             replace=1000, algo=None, env_name=None,
             checkpoint_dir='tmp/dqn'):
    self.lr = lr
    self.batch_size = batch_size
    self.input_dims = input_dims
    self.n_actions = n_actions
    self.gamma = gamma
    self.epsilon = epsilon
    self.eps_dec = eps_dec
    self.eps_min = eps_min
    self.replace = replace
    self.algo = algo
    self.env_name = env_name
    self.checkpoint_dir = checkpoint_dir
    self.action_space = [i for i in range(self.n_actions)]
    self.learn_step_counter = 0
    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + " " + self.algo + "_q_eval",
                               checkpoint_dir=self.checkpoint_dir)
    self.q_next = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + " " + self.algo + "_q_next",
                               checkpoint_dir=self.checkpoint_dir)
def main():
    env = RideHitch("data/norm1000.txt")
    print(env.requests_list)
    RL = DeepQNetwork(env.pool_size, env.state_num,
                      learning_rate=0.01,
                      reward_decay=0.99,
                      e_greedy=1,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=False,
                      T=env.T_threshold,
                      D=env.D_threshold)
    step = 0
    matched_list = []
    for episode in range(100):
        # init
        observation = env.reset(reset_seq=False)
        matched = 0
        print("seq size:", env.request_num, "pool size:", env.pool_size)
        while True:
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            if reward > 0:
                matched += 1
            RL.store_transition(observation, action, reward, observation_)
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            observation = observation_
            if done:
                break
            step += 1
        matched_list.append(matched)
        print("eps", episode, "matching", matched)
    # print(matched_list)
    RL.plot_cost()
MAX_EPISODES = 900
ON_TRAIN = True

# set env
env = DataEnv()
s_dim = env.state_dim
a_dim = env.action_dim
clf = Classifier('logistic')

# set RL method
rl = DeepQNetwork(
    a_dim, s_dim,
    learning_rate=0.01,
    reward_decay=0.9,
    e_greedy=0.9,
    replace_target_iter=200,
    memory_size=2000,
    # output_graph=True
)

steps = []


def train():
    # start training
    for i in range(MAX_EPISODES):
        state = env.reset(clf)
        ep_r = 0.
        while True:
            # env.render()
Tensorflow: 1.0
gym: 0.8.0
"""
from wsn_env import Env
from DQN import DeepQNetwork
import matplotlib.pyplot as plt
import numpy as np

env = Env()
RL = DeepQNetwork(
    n_actions=env.n_actions,
    n_features=env.n_features,
    learning_rate=0,
    e_greedy=0,
    replace_target_iter=300,
    memory_size=3000,
    e_greedy_increment=0.0002,
)

total_steps = 0
rewards = []

for i_episode in range(env.times):
    observation = env.reset()
    ep_r = 0
    while True:
        action = RL.choose_action(observation)
        if once == False:
            once = True
            print("break out of the while loop")
            print(DQN.epsilon)
            print(DQN.learn_step_counter)
            break
        time.sleep(3.0)
"""

global DQN
DQN = DeepQNetwork(n_actions, n_features,
                   learning_rate=0.03,
                   reward_decay=0.9,
                   replace_target_iter=150,
                   memory_size=1000,
                   # output_graph=True
                   )
t = threading.Thread(target=run)
t.daemon = True
t.start()
t.join()  # join() must be called; a bare t.join is a no-op
start_simulation()
DQN.plot_q_t()
DQN.plot_cost()

plot_values = []
accumulation = 0
for i in range(len(scores)):
if __name__ == '__main__':
    rewards = [[], []]  # reward values for the makespan agent and the cost agent
    records = [[], [], []]  # metrics and policy sets for the makespan agent and the cost agent
    scaler = StandardScaler()
    env = Env(N_AGENT)
    memories = [Memory(MEMORY_SIZE) for i in range(N_AGENT)]
    memory = Memory(MEMORY_SIZE)
    dqn = [
        DeepQNetwork(env.n_actions, env.n_features, i,
                     learning_rate=0.0001,
                     replace_target_iter=REPLACE_TARGET_ITER,
                     e_greedy_increment=2e-5)
        for i in range(N_AGENT)
    ]
    run_env()
    fig, (ax0, ax1) = plt.subplots(nrows=2)
    ax0.grid(True)
    ax0.set_xlabel('episodes')
    ax0.set_ylabel('makespan metric')
    line01, = ax0.plot(rewards[0], color='orange', label="rewards", linestyle='-')
    # line02, = ax0.plot(records[0], label="records", linewidth=2)
max_steps = 1000
Env = env()
root = "./Result/"
child0 = mytyme
child1 = ['Datas/', "Src/"]
child2 = ["DQN/", 'Double-DQN/', 'Dueling-DQN/', "Double-Dueling-DQN/"]
child3 = ["data/", "map/", "memory/"]
envs = 1
support = Support(root=root, child0=child0, child1=child1,
                  child2=child2, child3=child3)
kind = 3
DQN = DeepQNetwork(double_q=False, dueling_q=False, env=str(envs))
steps = 0
for i in range(30):
    start = time.time()
    support.create_csv(Env.save_title, kind=kind, i=i + 1)
    s = Env.reset()
    while not Env.done:
        action = DQN.choose_actions(s)
        s_, r, done, advise = Env.step(action)
        DQN.store_transition(s, action, r, s_)
        s = s_
        if steps > 100:
            DQN.learn()
            Env.loss = DQN.cost
        steps += 1
        Env.steps += 1
        if done:
            break
        step += 1
    s.append(count)
    plt.plot(np.arange(len(s)), s)
    plt.ylabel('points to goal')
    plt.xlabel('training steps')
    plt.savefig("figPtsv1.png")
    # elapsed time must be now minus start, not the reverse
    total_time = time.time() - start
    f = open("trainTime.txt", "w+")
    f.write(str(total_time))
    f.close()
    print('Finished')


if __name__ == "__main__":
    # maze game
    env = Map()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=True)
    run_map()
    RL.plot_cost()
from DQN import DeepQNetwork
import gym

env = gym.make("CartPole-v0")
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(n_actions=env.action_space.n,
                  n_features=env.observation_space.shape[0],  # (4,)[0]
                  learning_rate=.01,
                  e_greedy=.9,
                  replace_target_iter=100,
                  memory_size=2000,
                  e_greedy_increment=.001,
                  )

total_steps = 0

for episode in range(1000):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
if __name__ == "__main__": n_serverfarms = 20 n_servers = 15 n_vms = 5 env = e.cloud_env(n_serverfarms, n_servers, n_vms) df_task_usage = read_data.read_data() job_queue = {} RL_farm = DeepQNetwork( n_serverfarms * 24, n_serverfarms * n_servers * 2 + 4, '_farm', learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, replace_target_iter=200, memory_size=2000, # output_graph=True ) RL_server = [] for i in range(n_serverfarms): dqn = DeepQNetwork( n_servers, n_servers * 2 + 4, '_server_' + str(i), learning_rate=0.01, reward_decay=0.9, e_greedy=0.9,
            t = t + 1
        else:
            reward = match.matching(graph.left, graph.right, graph.edge)
            print("r", reward)
            l_num = len(graph.left)
            r_num = len(graph.right)
            l = 0
            state_ = np.array([
                l_num, r_num,
                match.fake_matching(graph.left, graph.right, graph.edge),
                l
            ])
            RL.store_transition(state, action, reward, state_)
            t = t + 1
        if (t > 200) and (t % 5 == 0):
            print("sss")
            RL.learn()
        all_reward = reward + all_reward
        print("score", all_reward, "rounds", t)


if __name__ == '__main__':
    RL = DeepQNetwork(2, 4,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      output_graph=True)
    run_net()
import numpy as np
import torch as T
# ReplayBuffer and DeepQNetwork come from this project's own modules


class DDQNAgent():
    """
    A Double DQN agent has two networks: one local network and one target
    network. The local network is trained every iteration and is used to
    predict actions. The target network is periodically updated to a soft
    copy of the local network. Without this separation, the Bellman update
    would use the same network to choose values and to compute the loss,
    so we separate training from prediction to help the agent learn.
    """

    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=10_000):
        self.gamma = gamma            # discount factor for future rewards
        self.epsilon = epsilon        # used for the epsilon-greedy action choice
        self.lr = lr                  # learning rate: how big a step the optimizer takes
        self.n_actions = n_actions    # number of actions available in the environment
        self.action_space = [i for i in range(n_actions)]  # indices of possible actions
        self.input_dims = input_dims  # input dimensions, defined by the agent's environment
        self.mem_size = mem_size      # maximum number of memories to store
        self.batch_size = batch_size  # mini-batch size to sample from memory
        self.eps_min = eps_min        # smallest possible epsilon value
        self.eps_dec = eps_dec        # how much to decrease epsilon each iteration
        self.replace_after = replace  # iterations between soft copies to the target network
        self.steps = 0                # iteration counter, used with replace_after
        # ReplayBuffer stores our memories and samples mini-batches
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.Q_local = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims)
        self.Q_target = DeepQNetwork(self.lr, self.n_actions,
                                     input_dims=self.input_dims)

    def store_memory(self, state, action, reward, state_, done):
        """Save a new memory to our ReplayBuffer."""
        self.memory.store_memory(state, action, reward, state_, done)

    def sample_batch(self):
        """Pull a stochastic mini-batch from our ReplayBuffer."""
        state, action, reward, state_, done = \
            self.memory.sample_batch(self.batch_size)
        states = T.tensor(state).to(self.Q_local.device)
        actions = T.tensor(action).to(self.Q_local.device)
        rewards = T.tensor(reward).to(self.Q_local.device)
        states_ = T.tensor(state_).to(self.Q_local.device)
        dones = T.tensor(done).to(self.Q_local.device)
        return states, actions, rewards, states_, dones

    def choose_action(self, observation):
        """
        Choose an action from our action space with an epsilon-greedy
        algorithm: we either EXPLOIT or EXPLORE based on a random
        probability. Exploiting chooses the best known action (confidence);
        exploring takes a random action, which may present new information
        for the agent to learn from.
        """
        if np.random.random() > self.epsilon:  # EXPLOIT
            state = T.tensor([observation], dtype=T.float).to(self.Q_local.device)
            actions = self.Q_local.forward(state)
            action = T.argmax(actions).item()  # .item() extracts the index from the tensor
        else:  # EXPLORE
            action = np.random.choice(self.action_space)  # random action from our action space
        return action

    def replace_target_network(self):
        """
        Every replace_after iterations, update the target network to a
        soft copy of the local network.
        """
        if self.replace_after is not None and \
                self.steps % self.replace_after == 0:
            self.Q_target.load_state_dict(self.Q_local.state_dict())

    def decrement_epsilon(self):
        """Decrease epsilon, but not below eps_min."""
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def learn(self):
        """
        Main part of our agent. First we zero the optimizer's gradients so
        they don't accumulate across batches. Then we sample a stochastic
        mini-batch from our ReplayBuffer, make predictions and evaluations
        on it, compute the loss, and step the optimizer. Finally we
        decrement epsilon and the cycle (SEE -> DO -> LEARN) begins again.
        """
        if self.memory.mem_cntr < self.batch_size:  # don't learn until we have a full batch
            return
        self.Q_local.optimizer.zero_grad()  # clear accumulated gradients
        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_batch()
        indices = np.arange(self.batch_size)
        q_pred = self.Q_local.forward(states)[indices, actions]  # local prediction
        q_next = self.Q_target.forward(states_)                  # target prediction
        q_eval = self.Q_local.forward(states_)
        max_actions = T.argmax(q_eval, dim=1)  # local network picks the next action
        q_next[dones] = 0.0  # zero the future value for terminal states
        q_target = rewards + self.gamma * q_next[indices, max_actions]  # Bellman equation
        loss = self.Q_local.loss(q_target, q_pred).to(self.Q_local.device)
        loss.backward()  # back-propagation
        self.Q_local.optimizer.step()
        self.steps += 1
        self.decrement_epsilon()

    def save_agent(self):
        self.Q_local.save_model('local')
        self.Q_target.save_model('target')

    def load_agent(self):
        self.Q_local.load_model('local')
        self.Q_target.load_model('target')
config = load_config(config_path)
reward_list = []
RL = None
for n_agent in range(1, 13):
    for seed in range(1, 13):
        env = Select(params=config['env'], n_agent=n_agent,
                     attack_mode=args.attack_mode)
        # reset the graph so the model can be rebuilt on each loop iteration
        tf.reset_default_graph()
        RL = DeepQNetwork(env.n_action, env.n_agent,
                          seed=seed,
                          learning_rate=args.learning_rate,
                          reward_decay=args.reward_delay,  # weight short-term vs. long-term rewards
                          replace_target_iter=args.replace_target_iter,
                          memory_size=args.memory_size,
                          output_graph=args.output_graph,
                          n_input=n_agent,
                          prioritized=args.prioritized)
        reward_max = 0
        observation_max = []
        episode_max = 0
        # reward_last = 0
        # observation_last = []
        # episode_last = 0
        reward = 0
        step = 0
        print('===========' + ' start train! ' + '===========')
        print()
        for episode in range(args.max_episode):
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import time
import numpy as np
import random

# Global variables
MIN_VALUE = -10000
IMG_SIZE = 300
action_space = ['u', 'd', 'l', 'r', 'ur', 'rd', 'ld', 'ul']
n_actions = len(action_space)
n_features = 2

RL = DeepQNetwork(n_actions, n_features,
                  learning_rate=0.01,
                  reward_decay=0.9,
                  e_greedy=0.9,
                  replace_target_iter=200,
                  memory_size=2000,
                  output_graph=False,
                  testing=True)


def init_map(filename):
    map_file = np.loadtxt('map.txt', dtype=int)
    # bounding negative values for keeping it in bounds
    map_file[0, :] = MIN_VALUE
    map_file[:, 0] = MIN_VALUE
    map_file[:, len(map_file) - 1] = MIN_VALUE
    map_file[len(map_file) - 1, :] = MIN_VALUE
    return map_file
print(str(env))
brain_name = env.external_brain_names[0]
tf.reset_default_graph()
summary_path = "./summaries/{}".format(run_path)
if not os.path.exists(summary_path):
    os.makedirs(summary_path)

## Q2 Start
RL = DeepQNetwork(4, 8,
                  learning_rate=0.001,
                  reward_decay=0.99,
                  e_greedy=0.975,
                  replace_target_iter=4,
                  memory_size=10000,
                  e_greedy_increment=None)
## Q2 End

init = tf.global_variables_initializer()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
summary_writer = tf.summary.FileWriter(summary_path)


def PrintAndSaveSummary(writer, episode, episodeStep, episodeReward,
                        epsilon, lr):
    print('Episode:', episode, 'length =', episodeStep)
            print('episode:' + str(episode) + ' steps:' + str(step) +
                  ' reward:' + str(rwd) + ' eps_greedy:' + str(dqn.epsilon))
            rewards.append(rwd)
            break


if __name__ == '__main__':
    rewards = []
    env = Env(N_VM)
    memories = Memory(MEMORY_SIZE)
    dqn = DeepQNetwork(env.n_actions, env.n_features,
                       learning_rate=0.001,
                       replace_target_iter=200,
                       e_greedy_increment=3e-5)
    run_env(EPISODES, MINI_BATCH)
    dqn.plot_cost()
    plt.plot(np.arange(len(rewards)), rewards)
    plt.plot(np.arange(len(rewards)), [138 for i in range(len(rewards))])
    plt.ylabel('reward')
    plt.xlabel('episode')
    plt.show()
    return action[0]


# DQN
def action_transform(pp_obser):
    return [pp_obser['pp'], pp_obser['n']]


env = env.unwrapped
print(env.action_space)
print(env.observation_space)
# print(env.observation_space.high)
# print(env.observation_space.low)

RL = DeepQNetwork(n_actions=env.action_space.n,
                  n_features=2,
                  learning_rate=alpha,
                  reward_decay=gamma,
                  e_greedy=0.9,
                  replace_target_iter=100,
                  memory_size=2000,
                  e_greedy_increment=0.01,)

total_steps = 0
best_reward = 0
best_pp = None
reward_list = []

for episode in range(nEpisodes):
    # DQN
    observation, info = env.reset()
    # frame = info["frame"]
    ep_r = 0
            # break while loop when end of this episode
            if RL.memory_counter > MEMORY_CAPACITY:
                RL.learn()
            if done:
                break
            step += 1
            state = state_
        print('episode is:', episode)
    # end of training
    print('training over')
    # end of game
    print('game over')


if __name__ == "__main__":
    size = 12
    m = 6
    MEMORY_CAPACITY = 4000
    board = Board(size, m)
    RL = DeepQNetwork(size**2, size**2)
    train(1)
    RL.save_net()
    print('Finish')
        observation_, reward, done = env.step(action)
        RL.store_transition(observation, action, reward, observation_)
        if (step > 200) and (step % 5 == 0):
            RL.learn()
        # swap observation
        observation = observation_
        # break while loop when end of this episode
        if done:
            break
        step += 1
    # end of game
    print('game over')
    # env.destroy()


if __name__ == "__main__":
    env = Environment(rule=Rule())
    RL = DeepQNetwork(list(range(41)), 2,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    main()
    RL.plot_cost()
        return 1.0 + PENALTY, dist_
    if dist_ > dist:
        return -1.0 + PENALTY, dist_
    return 0.0 + PENALTY, dist_


if __name__ == "__main__":
    # maze game
    env = Env()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.0001,
                      reward_decay=0.9,
                      e_greedy=0.75,
                      replace_target_iter=2000,
                      memory_size=MEMORYCAPACITY,
                      batch_size=64
                      # output_graph=True
                      )
    RL.restore_model()
    for episode in range(EPS):
        env.build_map()
        value = 0
        for step in range(STEP):
            state = env.state.copy()
            action = RL.choose_action(state)
            env.step(action_space[action])
            state_ = env.state.copy()
            reward, dist = compute_reward(state, state_)
class LTDQN(Approach):
    def __init__(self, budget, times, users, n_scope, r_interval=0.01,
                 isTrain=True):
        Approach.__init__(self, budget, times, users)
        self.n_scope = n_scope
        self.state_dim = 8
        self.action_dim = 9
        self.r_interval = r_interval
        if isTrain:
            self.dqn = DeepQNetwork(self.action_dim, self.state_dim)
        else:
            self.dqn = DeepQNetwork(self.action_dim, self.state_dim,
                                    e_greedy_increment=None)

    def generate_reward(self, action, user):
        # actions 1-2 adjust the per-offer reward; 3-4 adjust the offer count;
        # 5-8 combine the two adjustments
        if action == 1:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
        elif action == 2:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
        elif action == 3:
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 4:
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 5:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 6:
            user.default_single_r += self.r_interval
            if user.default_single_r > 1.:
                user.default_single_r = 1.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1
        elif action == 7:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num += 1
            if user.default_num > self.n_scope:
                user.default_num = self.n_scope
        elif action == 8:
            user.default_single_r -= self.r_interval
            if user.default_single_r < 0.:
                user.default_single_r = 0.
            user.default_num -= 1
            if user.default_num < 1:
                user.default_num = 1

    def simulate(self):
        self.dqn.load()
        for ep in range(1):
            total_benefits = 0.
            total_expense = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            user.receive_offer(user.default_single_r,
                                               user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)
                    total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()
                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                    user.state = user.state_.copy()
                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                        total_expense += benefits / (1. - benefits + 0.001)
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num / len(self.users))
                    self.ratio.append(total_expense)
                print("\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f"
                      % (ep, time, self.budget, total_benefits), end=' ')
            print()

    def init_users_list(self):
        user_list = []
        arr = np.loadtxt('../dataset/test.txt', delimiter=' ')
        total_cost = 0.
        for row in range(arr.shape[0]):
            data = arr[row, :]
            user = User(row, float(data[0]), data[1:])
            total_cost += user.preference[np.argmax(user.preference)] - \
                user.preference[-1]
            user_list.append(user)
        print(len(user_list), total_cost)
        return user_list

    def train(self):
        for ep in range(500):
            self.budget = 50000
            self.users = self.init_users_list()
            self.dqn.epsilon = 0
            total_benefits = 0.
            for time in range(self.times):
                total_affected_num = 0
                total_req_num = 0.
                for user in self.users:
                    if user.finished == 0:
                        if self.budget > 0:
                            output = self.dqn.choose_action(user.state)
                            self.generate_reward(output, user)
                            user.receive_offer(user.default_single_r,
                                               user.default_num, output)
                            self.budget -= user.r
                        else:
                            user.receive_offer(0., 0, 0)
                    total_req_num += user.req_num
                    action, benefits, reward, done = user.choose_action()
                    if done:
                        if user.finished == 0:
                            self.budget += user.r
                        user.reset_status()
                    self.dqn.store_transition(user.state, action, reward,
                                              user.state_)
                    user.state = user.state_.copy()
                    self.dqn.learn()
                    if user.action == len(user.preference) - 1:
                        total_affected_num += 1
                    if benefits > 0:
                        total_benefits += benefits
                if (time + 1) % self.interval == 0:
                    self.affected_users_num.append(total_affected_num)
                    self.total_benefits.append(total_benefits)
                    self.average_req_num.append(total_req_num / len(self.users))
                print("\rEpisode: %d, Time step: %d, Budget: %f, Benefits: %f"
                      % (ep, time, self.budget, total_benefits), end=' ')
                if self.budget <= 0:
                    break
            print()
        self.dqn.save()
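# A brief, hypothetical driver for LTDQN, assuming Approach's constructor
# takes (budget, times, users) as shown above; the budget, horizon, and
# n_scope values are placeholders, not taken from the original code.
agent = LTDQN(budget=50000, times=1000, users=[], n_scope=5, isTrain=True)
agent.train()  # train() loads users via init_users_list and saves the DQN

evaluator = LTDQN(budget=50000, times=1000,
                  users=evaluator_users, n_scope=5, isTrain=False)
evaluator.simulate()  # loads the saved DQN and runs one evaluation episode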
    # end
    print('over')
    sys.exit()


if __name__ == "__main__":
    ### get environment
    # env = gym.make('HalfCheetah-v2')  # HalfCheetah, Ant, Humanoid
    env = TwoLeggedEnv()  # SixLeggedEnv()
    # env = myEnv()  # self-defined environment

    ### initialize rl_agent
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    print(action_dim)
    if isTrain:
        model_path = "models/dqn_two_legged"
    else:
        model_path = "models/dqn_two_legged_final"
    rl_agent = DeepQNetwork(model_path, action_dim, state_dim,
                            learning_rate=0.01,
                            reward_decay=0.9,
                            e_greedy=0.9,
                            replace_target_iter=200,
                            memory_size=2000,
                            output_graph=True)
    # pass rl_agent to run the environment
    run_ant(rl_agent)
from DQN import DeepQNetwork

dqn = DeepQNetwork(atari_env='SpaceInvaders-v4',
                   state_dimension=[88, 80, 3],
                   action_dimension=6,
                   train_step=4)
dqn.run()