def run_DQN(self, seed_n, Exp, Double, Prioritized):
    ############## copy parameters ##############
    sess = self.sess
    dis = self.dis
    REPLAY_MEMORY = self.REPLAY_MEMORY
    replay_memory = self.replay_memory
    batch_size = self.batch_size
    size_action_batch = self.size_action_batch
    Game = self.Game
    save_epi = self.save_epi
    save_network = self.save_network
    max_episodes = self.max_episodes
    max_steps = self.max_steps
    env = self.env
    random_action = self.random_action
    input_size = self.input_size
    output_size = self.output_size
    alpha = self.alpha
    beta_init = self.beta_init
    beta_max_step = self.beta_max_step
    eps = self.eps
    eps_div = self.eps_div
    s_scale = self.s_scale
    training_step = self.training_step
    copy_step = self.copy_step
    action_copy_step = self.action_copy_step
    action_train = self.action_train
    weighted_train = self.weighted_train
    repu_num = self.repu_num
    DDPG = self.DDPG
    ending_cond_epis = self.ending_cond_epis
    ending_cond_reward = self.ending_cond_reward

    env.seed(seed_n)
    np.random.seed(seed_n)
    tf.set_random_seed(seed_n)
    random.seed(seed_n)
    #############################################

    Q_Network = self.Q_Network
    A_batch = Q_Network.get_action_batch()

    if DDPG:
        Action_Network = self.Action_Network
        # grad_inv is used when training the DDPG actor network.
        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()
        action_bounds = [action_max, action_min]
        grad_inv = grad_inverter(sess, action_bounds)

    case_n = seed_n + 1
    end_episode = 0
    step_count_total = 0
    global_step = 0
    loss = 0
    e = 1.

    replay_buffer = deque()
    Q_list = []
    TD_buffer = deque()
    steps_list = []
    step_avg_list = []
    global_step_list = []
    average_distance = []
    rate_of_adjacent = []

    print("")
    print("CASE {}".format(case_n))
    print(" STATE DIM : {}, ACTION DIM : {}".format(input_size, self.action_dim))
    print(" Exp : {}".format(Exp))
    if DDPG:
        print(" Strategy : Double : {}, Prioritized : {}, DDPG : {}"
              .format(Double, Prioritized, DDPG))
    elif random_action:
        if action_train:
            print(" Strategy : Double : {}, Prioritized : {}, ACTION : RANDOM, ACTION TRAIN 'ON'"
                  .format(Double, Prioritized))
        else:
            print(" Strategy : Double : {}, Prioritized : {}, ACTION : RANDOM"
                  .format(Double, Prioritized))
    else:
        if action_train:
            print(" Strategy : Double : {}, Prioritized : {}, ACTION : DISCRETIZATION, ACTION TRAIN 'ON'"
                  .format(Double, Prioritized))
        else:
            print(" Strategy : Double : {}, Prioritized : {}, ACTION : DISCRETIZATION"
                  .format(Double, Prioritized))
    print("")

    for episode in range(1, max_episodes + 1):
        done = False
        step_count = 0
        current_step = 0
        cost = 0
        state = env.reset()

        while not done:
            # Decay epsilon; once it drops below 0.001 it no longer shrinks.
            if e > 0.001:
                #e = 1. / ((float(episode - 1) / eps_div) + 1)
                e = 1. / ((float(global_step) / eps_div) + 1)
            t4 = time.time()

            if DDPG:
                # With DDPG enabled, the actor network chooses the action.
                action = Action_Network.evaluate_actor(
                    np.reshape(state, [1, input_size]))[0]
            else:
                # Otherwise, get_q_batch returns the Q-value of every candidate
                # action for this state, and the exploration strategy picks one.
                action0 = Exploration.choice_action(
                    Exp, e, s_scale,
                    np.reshape(Q_Network.get_q_batch(np.reshape(state, [1, -1])), [1, -1])[0])
                action = A_batch[action0]

            next_state, reward, done, _ = env.step(action)
            step_count += reward
            global_step += 1
            current_step += 1

            # With prioritization, store the transition in the sum tree
            # (replay_memory); otherwise append it to replay_buffer, which is
            # sampled uniformly at random.
            if Prioritized:
                replay_memory.save_experience(state, action, reward, next_state, done)
            else:
                replay_buffer.append((state, next_state, action, reward, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

            state = next_state

            if global_step <= beta_max_step:
                replay_memory.anneal_per_importance_sampling(global_step, beta_max_step)

            # Run a training update every training_step environment steps.
            if global_step > batch_size and global_step % training_step == 0:
                for re in range(repu_num):  # repeat training repu_num times (almost always 1)
                    if Prioritized:
                        # Draw a batch from replay_memory.
                        idx, priorities, w_batch, experience = replay_memory.retrieve_experience(batch_size)
                        minibatch = self.format_experience(experience)
                        if DDPG:
                            # With DDPG, train both the Q-network and the actor network.
                            errors, cost = Train.train_prioritized_DDPG(
                                Q_Network, Action_Network, minibatch, w_batch,
                                output_size, grad_inv)
                            replay_memory.update_experience_weight(idx, errors)
                        else:
                            # Without DDPG, train only the Q-network.
                            errors, cost, state_t_batch = Train.train_prioritized(
                                Q_Network, minibatch, w_batch, Exp, s_scale,
                                input_size, output_size, size_action_batch)
                            replay_memory.update_experience_weight(idx, errors)

                            # Train the action set every action_copy_step steps;
                            # with action_train disabled this reduces to the RAS algorithm.
                            if action_train and global_step % action_copy_step == 0:
                                action_weight = []
                                if weighted_train:
                                    # WARAS algorithm: compute a weight per action.
                                    for k in range(batch_size):
                                        state_t = np.reshape(state_t_batch[k], [1, -1])
                                        q_batch = Q_Network.get_q_batch(state_t)
                                        q_batch = np.reshape(q_batch, [1, -1])[0]
                                        q_batch = q_batch * 10.
                                        max_q = np.max(q_batch)
                                        q_batch = np.exp(q_batch - max_q)
                                        action_weight.append(q_batch)
                                else:
                                    # ARAS algorithm: set every weight to 1.
                                    action_weight = np.ones([batch_size, size_action_batch])

                                # Train the Q-network's action set with these weights.
                                Q_Network.train_weighted_actor(state_t_batch, action_weight)

                                # Update the target action set.
                                Q_Network.update_action_target_critic()
                                A_batch = Q_Network.get_action_batch()
                                t_A_batch = Q_Network.get_target_action_batch()
                                """
                                # Find pairs of nearby actions and resample them.
                                A_batch, t_A_batch = self.realign_action_batch(A_batch, t_A_batch)
                                Q_Network.realign_action_batch(A_batch, t_A_batch)
                                A_batch = Q_Network.get_action_batch()
                                t_A_batch = Q_Network.get_target_action_batch()
                                """
                    else:
                        # Without prioritization, form a uniformly random minibatch and train on it.
                        minibatch = random.sample(replay_buffer, batch_size)
                        if DDPG:
                            cost = Train.train_DDPG(
                                Q_Network, Action_Network, minibatch, output_size, grad_inv)
                        else:
                            cost, state_t_batch = Train.train(
                                Q_Network, minibatch, Exp, s_scale,
                                input_size, output_size, size_action_batch)

            # Update the target networks every copy_step steps.
            if global_step % copy_step == 0:
                if DDPG:
                    # Update the target critic and target actor networks.
                    Q_Network.update_target_critic()
                    Q_Network.update_action_target_critic()
                    Action_Network.update_target_actor()
                else:
                    Q_Network.update_target_critic()
                    Q_Network.update_action_target_critic()

        steps_list.append(step_count)
        global_step_list.append(global_step)

        # Print the running average of the results.
        if episode < ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_avg_list.append(step_count_total / episode)
        if episode == ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_avg_list.append(step_count_total / ending_cond_epis)
        if episode > ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_count_total -= steps_list[episode - 1 - ending_cond_epis]
            step_avg_list.append(step_count_total / ending_cond_epis)

        print("{} {}".format(episode, round(step_avg_list[episode - 1], 3)))
        if DDPG:
            print(" ( Result : {}, Loss : {}, Steps : {}, Global Steps : {} )"
                  #.format(round(step_count, 3), round(cost, 5), current_step, global_step))
                  .format(round(step_count, 3), 0, current_step, global_step))
        elif Exp == 'epsilon' or Exp == 'sparsemax':
            print(" ( Result : {}, Loss : {}, Epsilon : {}, Steps : {}, Global Steps : {} )"
                  #.format(round(step_count, 3), round(cost, 5), round(e, 4), current_step, global_step))
                  .format(round(step_count, 3), 0, round(e, 5), current_step, global_step))
        else:
            print(" ( Result : {}, Loss : {}, Steps : {}, Global Steps : {} )"
                  #.format(round(step_count, 3), round(cost, 5), current_step, global_step))
                  .format(round(step_count, 3), 0, current_step, global_step))

        distance, per_of_sim, per_of_sim2 = self.get_action_variance(A_batch)
        print(" ( Action Batch :::: Distance : {}, Percent : {}%({}%) )"
              .format(distance, per_of_sim, per_of_sim2))
        average_distance.append(distance)
        rate_of_adjacent.append(per_of_sim)

        # Save the networks
        if episode % save_epi == 0:
            file_case = str(case_n)
            if save_network:
                Q_Network.save_network(game_name=self.file_name + '_seed' + file_case,
                                       episode=episode, save_epi=save_epi)
            with open('/home/minjae/Desktop/JOLP/' + self.file_name
                      + '_seed' + file_case, 'wb') as fout:
                pickle.dump(step_avg_list, fout)
            with open('/home/minjae/Desktop/JOLP/' + self.file_name
                      + '_global_' + '_seed' + file_case, 'wb') as fout2:
                pickle.dump(global_step_list, fout2)

            x_values = list(range(1, episode + 1))
            y_values = step_avg_list[:]
            plt.plot(x_values, y_values, c='green')
            plt.title(self.file_name)
            plt.grid(True)
            plt.show()
            with open('/home/minjae/Desktop/JOLP/' + 'Average_of_Distance_(' + self.file_name
                      + '_seed' + file_case + ')', 'wb') as fout:
                pickle.dump(average_distance, fout)
            with open('/home/minjae/Desktop/JOLP/' + 'Rate_of_Adjacent_(' + self.file_name
                      + '_global_' + '_seed' + file_case + ')', 'wb') as fout2:
                pickle.dump(rate_of_adjacent, fout2)

            p_values = list(range(1, episode + 1))
            q_values = average_distance[:]
            r_values = rate_of_adjacent[:]
            plt.plot(p_values, q_values, c='r')
            plt.title('Average of Distance between Actions')
            plt.grid(True)
            plt.show()
            plt.plot(p_values, r_values, c='b')
            plt.title('Rate of Adjacent Actions')
            plt.grid(True)
            plt.show()

        end_episode += 1

        # Stop training once the running-average result reaches the target.
        if step_avg_list[episode - 1] > ending_cond_reward:
            break
        # Stop training once max_steps environment steps have been taken.
        if global_step > max_steps:
            break

    print("--------------------------------------------------")
    print("--------------------------------------------------")

    # If training stopped early because the target was reached,
    # run the remaining episodes with the learned (greedy) policy.
    for episode in range(end_episode + 1, max_episodes + 1):
        if global_step > max_steps:
            break
        state = env.reset()
        reward_sum = 0
        done = False
        while not done:
            # Choose the action with the highest Q-value.
            action = np.argmax(Q_Network.evaluate_critic(np.reshape(state, [1, input_size])))
            # conti_action_flag and action_map are not unpacked at the top of this
            # method, so they are read from self here.
            if self.conti_action_flag:
                action = [self.action_map[action]]
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            global_step += 1

            if done:
                steps_list.append(reward_sum)
                global_step_list.append(global_step)
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 1 - ending_cond_epis]
                step_avg_list.append(step_count_total / ending_cond_epis)
                print("{} {}".format(episode, round(step_avg_list[episode - 1], 3)))
                print(" ( Result : {} )".format(reward_sum))

                if episode % save_epi == 0:
                    file_case = str(case_n)
                    if save_network:
                        Q_Network.save_network(game_name=self.file_name + '_seed' + file_case,
                                               episode=episode, save_epi=save_epi)
                    with open('/home/minjae/Desktop/JOLP/' + self.file_name
                              + '_seed' + file_case, 'wb') as fout:
                        pickle.dump(step_avg_list, fout)
                    with open('/home/minjae/Desktop/JOLP/' + self.file_name
                              + '_global_' + '_seed' + file_case, 'wb') as fout2:
                        pickle.dump(global_step_list, fout2)
                    x_values = list(range(1, episode + 1))
                    y_values = step_avg_list[:]
                    plt.plot(x_values, y_values, c='green')
                    plt.title(self.file_name)
                    plt.grid(True)
                    plt.show()

    # Save the results.
    file_case = str(case_n)
    with open('/home/minjae/Desktop/JOLP/' + self.file_name
              + '_seed' + file_case, 'wb') as fout:
        pickle.dump(step_avg_list, fout)
    with open('/home/minjae/Desktop/JOLP/' + self.file_name
              + '_global_' + '_seed' + file_case, 'wb') as fout2:
        pickle.dump(global_step_list, fout2)

    # Plot the results.
    x_values = list(range(1, len(step_avg_list) + 1))
    y_values = step_avg_list[:]
    plt.plot(x_values, y_values, c='green')
    plt.title(self.file_name)
    plt.grid(True)
    plt.show()

    with open('/home/minjae/Desktop/JOLP/' + 'Average_of_Distance_(' + self.file_name
              + '_seed' + file_case + ')', 'wb') as fout:
        pickle.dump(average_distance, fout)
    with open('/home/minjae/Desktop/JOLP/' + 'Rate_of_Adjacent_(' + self.file_name
              + '_global_' + '_seed' + file_case + ')', 'wb') as fout2:
        pickle.dump(rate_of_adjacent, fout2)

    p_values = list(range(1, episode + 1))
    q_values = average_distance[:]
    r_values = rate_of_adjacent[:]
    plt.plot(p_values, q_values, c='r')
    plt.title('Average of Distance between Actions')
    plt.grid(True)
    plt.show()
    plt.plot(p_values, r_values, c='b')
    plt.title('Rate of Adjacent Actions')
    plt.grid(True)
    plt.show()
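# --- Illustrative sketch (not part of the original class) --------------------
# The WARAS branch above weights each candidate action by an unnormalized,
# numerically stabilized softmax of its temperature-scaled Q-value
# (q * 10., minus the max, then exp).  The helper below reproduces that
# weighting on a plain NumPy array so the formula can be checked in isolation;
# the function name, signature, and default temperature are assumptions made
# only for illustration and do not exist elsewhere in this file.
import numpy as np  # already imported at module level; repeated so the sketch stands alone


def _waras_action_weights(q_values, temperature=10.):
    """Return exp(temperature * q - max) weights for one state's action batch."""
    scaled = np.asarray(q_values, dtype=np.float64) * temperature
    scaled -= np.max(scaled)   # subtract the max so exp() cannot overflow
    return np.exp(scaled)      # unnormalized weight per candidate action


# Example: Q-values [0.1, 0.2, 0.15] give weights ~[0.368, 1.0, 0.607],
# so the best-valued action dominates the weighted action-set update.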
def run_DQN(self, case_n, seed_n, Exp, Double, Dueling, Prioritized):
    sess = self.sess
    dis = self.dis
    REPLAY_MEMORY = self.REPLAY_MEMORY
    replay_memory = self.replay_memory
    batch_size = self.batch_size
    Game = self.Game
    save_epi = self.save_epi
    max_episodes = self.max_episodes
    env = self.env
    input_size = self.input_size
    output_size = self.output_size
    alpha = self.alpha
    beta_init = self.beta_init
    eps = self.eps
    eps_div = self.eps_div
    s_scale = self.s_scale
    training_step = self.training_step
    copy_step = self.copy_step
    repu_num = self.repu_num
    ending_cond_epis = self.ending_cond_epis
    ending_cond_reward = self.ending_cond_reward
    conti_action_flag = self.conti_action_flag
    action_map = self.action_map

    env.seed(seed_n)
    np.random.seed(seed_n)
    tf.set_random_seed(seed_n)
    random.seed(seed_n)

    Q_Network = self.Q_Network

    end_episode = 0
    step_count_total = 0
    global_step = 0
    loss = 0

    replay_buffer = deque()
    Q_list = []
    TD_buffer = deque()
    steps_list = []
    step_avg_list = []
    global_step_list = []

    print("")
    print("CASE {}".format(case_n))
    print(" STATE DIM : {}, ACTION DIM : {}".format(input_size, self.action_dim))
    print(" Exp : {}".format(Exp))
    print(" Strategy : Double : {}, Dueling : {}, Prioritized : {}"
          .format(Double, Dueling, Prioritized))

    t = t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = 0

    for episode in range(1, max_episodes + 1):
        t1 = time.time()
        #print("TIME {} --- EPISODE {}".format(t1 - t, episode))
        t = t1
        done = False
        step_count = 0
        current_step = 0
        cost = 0
        state = env.reset()

        while not done:
            t7 = time.time()
            e = 1. / ((float(episode - 1) / eps_div) + 1)
            action = Exploration.choice_action(
                Exp, e, s_scale,
                Q_Network.evaluate_critic(np.reshape(state, [1, input_size]))[0])
            if conti_action_flag:
                action0 = [action_map[action]]
            else:
                action0 = action
            next_state, reward, done, _ = env.step(action0)
            step_count += reward
            global_step += 1
            current_step += 1
            #t2 = time.time()
            #print("TIME {} --- 1 {} {}".format(t2 - t7, episode, current_step))

            if Prioritized:
                replay_memory.save_experience(state, action, reward, next_state, done)
            else:
                replay_buffer.append((state, next_state, action, reward, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
            #t2 = time.time()
            #print("TIME {} --- 2 {} {}".format(t2 - t7, episode, current_step))

            state = next_state

            replay_memory.anneal_per_importance_sampling(global_step, max_episodes * 1000)

            if global_step > batch_size and global_step % training_step == 0:
                for re in range(repu_num):
                    minibatch = []
                    if Prioritized:
                        idx, priorities, w_batch, experience = replay_memory.retrieve_experience(batch_size)
                        minibatch = self.format_experience(experience, minibatch)
                        errors, cost = Train.train_prioritized(
                            Q_Network, minibatch, w_batch, Exp, s_scale,
                            input_size, output_size)
                        #print(errors)
                        #t2 = time.time()
                        #print("TIME {} --- 3 {} {}".format(t2 - t7, episode, current_step))
                        """
                        errors = []
                        for i in range(batch_size):
                            state_m, next_state_m, action_m, reward_m, done_m = minibatch[i]
                            q_t = np.max(Q_Network.evaluate_critic(np.reshape(state_m, [1, input_size])))
                            q_t_1 = np.max(Q_Network.evaluate_critic(np.reshape(next_state_m, [1, input_size])))
                            if done_m:
                                q_t_1 = reward_m
                            else:
                                q_t_1 = reward_m + dis * q_t_1
                            errors.append(q_t_1 - q_t)
                        """
                        #errors = Train.train_error(Q_Network, minibatch, Exp, s_scale, input_size, output_size)
                        #t2 = time.time()
                        #print("TIME {} --- 4 {} {}".format(t2 - t7, episode, current_step))
                        replay_memory.update_experience_weight(idx, errors)
                    else:
                        minibatch = random.sample(replay_buffer, batch_size)
                        Train.train(Q_Network, minibatch, Exp, s_scale, input_size,
                                    self.action_dim)
            if global_step > batch_size and global_step % copy_step == 0:
                Train.copy(Q_Network)
            #t8 = time.time()
            #print("TIME {} --- CYCLE {} {}".format(t8 - t7, episode, current_step))

        steps_list.append(step_count)
        global_step_list.append(global_step)

        # Print the average of result
        if episode < ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_avg_list.append(step_count_total / episode)
        if episode == ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_avg_list.append(step_count_total / ending_cond_epis)
        if episode > ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_count_total -= steps_list[episode - 1 - ending_cond_epis]
            step_avg_list.append(step_count_total / ending_cond_epis)

        print("{} {}".format(episode, round(step_avg_list[episode - 1], 3)))
        if Exp == 'epsilon' or Exp == 'sparsemax':
            print(" ( Result : {}, Loss : {}, Epsilon : {}, Steps : {}, Global Steps : {} )"
                  .format(round(step_count, 5), round(cost, 5), round(e, 5),
                          current_step, global_step))
        else:
            print(" ( Result : {}, Loss : {}, Steps : {}, Global Steps : {} )"
                  .format(round(step_count, 5), round(cost, 5), current_step, global_step))

        # Save the networks
        if episode % save_epi == 0:
            file_case = str(case_n)
            Q_Network.save_network(game_name=self.file_name + '_seed' + file_case,
                                   episode=episode, save_epi=save_epi)
            with open('/home/jolp/Desktop/Data/' + self.file_name
                      + '_seed' + file_case, 'wb') as fout:
                pickle.dump(step_avg_list, fout)
            with open('/home/jolp/Desktop/Data/' + self.file_name
                      + '_global_' + '_seed' + file_case, 'wb') as fout2:
                pickle.dump(global_step_list, fout2)

            x_values = list(range(1, episode + 1))
            y_values = step_avg_list[:]
            plt.plot(x_values, y_values, c='green')
            plt.title(self.file_name)
            plt.grid(True)
            plt.show()

        end_episode += 1
        if step_avg_list[episode - 1] > ending_cond_reward:
            break

    print("--------------------------------------------------")
    print("--------------------------------------------------")

    for episode in range(end_episode + 1, max_episodes + 1):
        state = env.reset()
        reward_sum = 0
        done = False
        while not done:
            #env.render()
            action = np.argmax(Q_Network.evaluate_critic(np.reshape(state, [1, input_size])))
            if conti_action_flag:
                action = [action_map[action]]
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            global_step += 1

            #if episode % save_epi == 0:
            #    Q_Network.save_network(episode = episode, save_epi = save_epi)
            #    Action_Network.save_network(episode = episode, save_epi = save_epi)

            if done:
                steps_list.append(reward_sum)
                global_step_list.append(global_step)
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 1 - ending_cond_epis]
                step_avg_list.append(step_count_total / ending_cond_epis)
                print("{} {}".format(episode, round(step_avg_list[episode - 1], 3)))
                print(" ( Result : {} )".format(reward_sum))

                if episode % save_epi == 0:
                    file_case = str(case_n)
                    Q_Network.save_network(game_name=self.file_name + '_seed' + file_case,
                                           episode=episode, save_epi=save_epi)
                    with open('/home/jolp/Desktop/Data/' + self.file_name
                              + '_seed' + file_case, 'wb') as fout:
                        pickle.dump(step_avg_list, fout)
                    with open('/home/jolp/Desktop/Data/' + self.file_name
                              + '_global_' + '_seed' + file_case, 'wb') as fout2:
                        pickle.dump(global_step_list, fout2)
                    x_values = list(range(1, episode + 1))
                    y_values = step_avg_list[:]
                    plt.plot(x_values, y_values, c='green')
                    plt.title(self.file_name)
                    plt.grid(True)
                    plt.show()

    file_case = str(case_n)
    with open('/home/jolp/Desktop/Data/' + self.file_name
              + '_seed' + file_case, 'wb') as fout:
        pickle.dump(step_avg_list, fout)
    with open('/home/jolp/Desktop/Data/' + self.file_name
              + '_global_' + '_seed' + file_case, 'wb') as fout2:
        pickle.dump(global_step_list, fout2)

    x_values = list(range(1, max_episodes + 1))
    y_values = step_avg_list[:]
    plt.plot(x_values, y_values, c='green')
    plt.title(self.file_name)
    plt.grid(True)
    plt.show()
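# --- Illustrative sketch (not part of the original class) --------------------
# The method above anneals epsilon with the harmonic schedule
#     e = 1 / ((episode - 1) / eps_div + 1),
# so exploration starts at 1.0 and decays toward 0 as episodes accumulate.
# The helper name and the eps_div value used in the example are assumptions
# made only for illustration.
def _epsilon_schedule(episode, eps_div):
    """Epsilon used on the given 1-indexed episode."""
    return 1. / ((float(episode - 1) / eps_div) + 1)


# Example with eps_div = 10:
#   episode   1 -> e = 1.0
#   episode  11 -> e = 0.5
#   episode 101 -> e = 1/11 ~ 0.0909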
def run_DQN(self, case_n, seed_n, Exp, Double, Dueling, Prioritized):
    sess = self.sess
    dis = self.dis
    REPLAY_MEMORY = self.REPLAY_MEMORY
    batch_size = self.batch_size
    Game = self.Game
    save_epi = self.save_epi
    max_episodes = self.max_episodes
    env = self.env
    input_size = self.input_size
    output_size = self.output_size
    alpha = self.alpha
    beta_init = self.beta_init
    eps = self.eps
    eps_div = self.eps_div
    s_scale = self.s_scale
    training_step = self.training_step
    copy_step = self.copy_step
    repu_num = self.repu_num
    ending_cond_epis = self.ending_cond_epis
    ending_cond_reward = self.ending_cond_reward
    conti_action_flag = self.conti_action_flag
    action_map = self.action_map

    env.seed(seed_n)
    np.random.seed(seed_n)
    tf.set_random_seed(seed_n)
    random.seed(seed_n)

    Q_Network = self.Q_Network

    end_episode = 0
    step_count_total = 0
    global_step = 0
    loss = 0

    replay_buffer = deque()
    Q_list = []
    TD_buffer = deque()
    steps_list = []
    step_avg_list = []
    global_step_list = []

    print("")
    print("CASE {}".format(case_n))
    print(" STATE DIM : {}, ACTION DIM : {}".format(input_size, self.action_dim))
    print(" Exp : {}".format(Exp))
    print(" Strategy : Double : {}, Dueling : {}, Prioritized : {}"
          .format(Double, Dueling, Prioritized))

    for episode in range(1, max_episodes + 1):
        done = False
        step_count = 0
        current_step = 0
        TD_error = 0
        state = env.reset()

        while not done:
            e = 1. / ((float(episode - 1) / eps_div) + 1)
            action = Exploration.choice_action(
                Exp, e, s_scale,
                Q_Network.evaluate_critic(np.reshape(state, [1, input_size]))[0])
            if conti_action_flag:
                #action = np.array(action_map[action])
                action0 = [action_map[action]]
            else:
                action0 = action
            next_state, reward, done, _ = env.step(action0)
            step_count += reward
            global_step += 1
            current_step += 1

            if Prioritized:
                # Proportional prioritization: keep (|TD error| + eps) ** alpha
                # for every transition, aligned with replay_buffer.
                q_t = np.max(Q_Network.evaluate_critic(np.reshape(state, [1, input_size])))
                q_t_1 = np.max(Q_Network.evaluate_critic(np.reshape(next_state, [1, input_size])))
                if done:
                    q_t_1 = reward
                else:
                    q_t_1 = reward + dis * q_t_1
                TD_buffer.append(pow(abs(q_t_1 - q_t) + eps, alpha))
                if len(TD_buffer) > REPLAY_MEMORY:
                    TD_buffer.popleft()

            replay_buffer.append((state, next_state, action, reward, done))
            if len(replay_buffer) > REPLAY_MEMORY:
                replay_buffer.popleft()

            state = next_state

            minibatch = []
            TD_choice = []
            if global_step > batch_size and global_step % training_step == 0:
                for re in range(repu_num):
                    minibatch = []
                    TD_choice = []
                    if Prioritized:
                        #TD_batch = Train.if_prioritized(Q_Network, replay_buffer, input_size, self.action_dim, eps, alpha)
                        TD_batch = np.array(TD_buffer) / sum(TD_buffer)
                        TD_choice = np.random.choice(len(TD_batch), size=batch_size,
                                                     replace=False, p=TD_batch)
                        for i in range(batch_size):
                            minibatch.append(replay_buffer[TD_choice[i]])
                    else:
                        minibatch = random.sample(replay_buffer, batch_size)

                    Train.train(Q_Network, minibatch, Exp, s_scale, input_size, self.action_dim)

                    if Prioritized:
                        # Recompute the priorities of the transitions just trained on.
                        for i in range(batch_size):
                            state_m, next_state_m, action_m, reward_m, done_m = minibatch[i]
                            q_t = np.max(Q_Network.evaluate_critic(np.reshape(state_m, [1, input_size])))
                            q_t_1 = np.max(Q_Network.evaluate_critic(np.reshape(next_state_m, [1, input_size])))
                            if done_m:
                                q_t_1 = reward_m
                            else:
                                q_t_1 = reward_m + dis * q_t_1
                            TD_buffer[TD_choice[i]] = pow(abs(q_t_1 - q_t) + eps, alpha)

            if global_step > batch_size and global_step % copy_step == 0:
                Train.copy(Q_Network)

        steps_list.append(step_count)
        global_step_list.append(global_step)

        # Print the average of result
        if episode < ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_avg_list.append(step_count_total / episode)
        if episode == ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_avg_list.append(step_count_total / ending_cond_epis)
        if episode > ending_cond_epis:
            step_count_total += steps_list[episode - 1]
            step_count_total -= steps_list[episode - 1 - ending_cond_epis]
            step_avg_list.append(step_count_total / ending_cond_epis)

        print("{} {}".format(episode, round(step_avg_list[episode - 1], 3)))
        if Exp == 'epsilon':
            print(" ( Result : {}, Loss : {}, Epsilon : {}, Steps : {}, Global Steps : {} )"
                  .format(round(step_count, 5), round(loss, 8), round(e, 5),
                          current_step, global_step))
        else:
            print(" ( Result : {}, Loss : {}, Steps : {}, Global Steps : {} )"
                  .format(round(step_count, 5), round(loss, 8), current_step, global_step))

        # Save the networks
        if episode % save_epi == 0:
            #Q_Network.save_network(episode = episode, save_epi = save_epi)
            #Action_Network.save_network(episode = episode, save_epi = save_epi)
            file_case = str(case_n)
            with open('/home/jolp/Desktop/Data/' + self.file_name
                      + '_seed' + file_case, 'wb') as fout:
                pickle.dump(step_avg_list, fout)
            with open('/home/jolp/Desktop/Data/' + self.file_name
                      + '_global_' + '_seed' + file_case, 'wb') as fout2:
                pickle.dump(global_step_list, fout2)

            x_values = list(range(1, episode + 1))
            y_values = step_avg_list[:]
            plt.plot(x_values, y_values, c='green')
            plt.title(self.file_name)
            plt.grid(True)
            plt.show()

        end_episode += 1
        if step_avg_list[episode - 1] > ending_cond_reward:
            break

    print("--------------------------------------------------")
    print("--------------------------------------------------")

    for episode in range(end_episode + 1, max_episodes + 1):
        state = env.reset()
        reward_sum = 0
        done = False
        while not done:
            #env.render()
            action = np.argmax(Q_Network.evaluate_critic(np.reshape(state, [1, input_size])))
            if conti_action_flag:
                action = [action_map[action]]
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            global_step += 1

            #if episode % save_epi == 0:
            #    Q_Network.save_network(episode = episode, save_epi = save_epi)
            #    Action_Network.save_network(episode = episode, save_epi = save_epi)

            if done:
                steps_list.append(reward_sum)
                global_step_list.append(global_step)
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 1 - ending_cond_epis]
                step_avg_list.append(step_count_total / ending_cond_epis)
                print("{} {}".format(episode, round(step_avg_list[episode - 1], 3)))
                print(" ( Result : {} )".format(reward_sum))

                if episode % save_epi == 0:
                    file_case = str(case_n)
                    with open('/home/jolp/Desktop/Data/' + self.file_name
                              + '_seed' + file_case, 'wb') as fout:
                        pickle.dump(step_avg_list, fout)
                    with open('/home/jolp/Desktop/Data/' + self.file_name
                              + '_global_' + '_seed' + file_case, 'wb') as fout2:
                        pickle.dump(global_step_list, fout2)
                    x_values = list(range(1, episode + 1))
                    y_values = step_avg_list[:]
                    plt.plot(x_values, y_values, c='green')
                    plt.title(self.file_name)
                    plt.grid(True)
                    plt.show()

    file_case = str(case_n)
    with open('/home/jolp/Desktop/Data/' + self.file_name
              + '_seed' + file_case, 'wb') as fout:
        pickle.dump(step_avg_list, fout)
    with open('/home/jolp/Desktop/Data/' + self.file_name
              + '_global_' + '_seed' + file_case, 'wb') as fout2:
        pickle.dump(global_step_list, fout2)

    x_values = list(range(1, max_episodes + 1))
    y_values = step_avg_list[:]
    plt.plot(x_values, y_values, c='green')
    plt.title(self.file_name)
    plt.grid(True)
    plt.show()
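# --- Illustrative sketch (not part of the original class) --------------------
# The Prioritized branch above stores priorities (|TD error| + eps) ** alpha in
# TD_buffer and samples minibatch indices with probability proportional to
# those priorities via np.random.choice.  The helper below reproduces that
# proportional sampling on a plain list of TD errors; the function name and the
# default eps/alpha values are assumptions made only for illustration.
import numpy as np  # already imported at module level; repeated so the sketch stands alone


def _proportional_sample(td_errors, batch_size, eps=0.01, alpha=0.6):
    """Sample batch_size distinct indices with probability ~ (|TD| + eps) ** alpha.

    len(td_errors) must be at least batch_size because sampling is without
    replacement, matching the replace=False call used above.
    """
    priorities = np.power(np.abs(np.asarray(td_errors, dtype=np.float64)) + eps, alpha)
    probs = priorities / priorities.sum()   # normalize priorities into a distribution
    return np.random.choice(len(td_errors), size=batch_size, replace=False, p=probs)


# Example: with td_errors = [0.0, 0.5, 2.0, 0.1] the third transition is the most
# likely to be drawn, but every transition keeps a non-zero probability thanks to
# the eps offset.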