def run():
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    N_ACTIONS = env.action_space.n
    N_STATES = env.observation_space.shape[0]
    RL = DeepQNetwork(N_ACTIONS, N_STATES)
    step = 0
    for i in range(600):  # play 600 episodes
        # init env
        observation = env.reset()
        step_in = 0
        while True:
            # refresh env
            env.render()
            action = RL.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            # modify the reward: higher when the cart is near the centre and the pole is upright
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2
            RL.store_transition(observation, action, r, observation_)
            if step > 200 and step % 5 == 0:
                RL.learn()
            if done:
                print('step_in:%s reward:%s' % (step_in, reward))
                plot_data.append(step_in)
                break
            observation = observation_
            step += 1
            step_in += 1
    # end of game
    print('game over')
    # env.destroy()
    # plot_data = np.array(plot_data, dtype='float32')
    # plot_data = np.divide(plot_data, plot_data.max())
    print(plot_data)
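# A minimal sketch of how run() above could be invoked. The import names and the
# module-level plot_data list are assumptions (they are not shown in this snippet);
# adjust them to whatever the surrounding file actually uses.
import gym
import numpy as np
from DQN_modified import DeepQNetwork

plot_data = []  # run() appends the per-episode step count here

if __name__ == '__main__':
    run()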
# -*- coding: utf-8 -*-
'''
Author: winddy
'''
import numpy as np

from grid_mdp import GridEnv
from DQN_modified import DeepQNetwork

env = GridEnv()
RL = DeepQNetwork(len(env.getAction()), len(env.getStates()),
                  learning_rate=0.01,
                  reward_decay=0.9,
                  e_greedy=0.1,
                  replace_target_iter=200,
                  memory_size=2000)

episodes = 2000
step = 0
for i in range(episodes):
    state = env.reset()
    while True:
        env.render()
        # one-hot encode the discrete state as the network's feature vector
        feature = [0] * len(env.getStates())
        feature[state - 1] = 1
        feature = np.hstack(feature)
        action = RL.choose_action(feature)
        state_, reward, done = env.step(action)
# move = line.strip().split(',')
# numbers_move = [int(l) for l in move]
# # print('aaa')
# # print(origin_chess)
# move_way = trans_action_to_A(origin_chess, numbers_move)
# # print(move_way)
# if line.strip().__contains__(',') == False:
#     move = line.strip().split(',')
#     numbers_move = [float(l) for l in move]
#     print(numbers_move)


if __name__ == "__main__":
    # print(S_test[9, 8])
    # print(trans_S(S_test))
    RL = DeepQNetwork(187, 96,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    file_path = os.getcwd() + '\\ajax.txt'
    parse_txt(file_path)
    RL.plot_cost()
    # s = '11.200000000000728'
    # print(s.__contains__('.'))
    # print(np.loadtxt(file_path))
    # print(trans_action_to_A(S_test, action_test))
    # print(trans_A_to_action(S_test, trans_action_to_A(S_test, action_test)))
            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            observation = observation_
            if done:
                print('total: %s, reward: %s' % (step, reward))
                break
            step += 1
            time.sleep(1)

    # end of game
    print('game over')
    env.destroy()


if __name__ == '__main__':
    env = Maze()
    # RL = QLearningTable(actions=list(range(env.n_actions)))
    # RL = SarsaLambda(actions=list(range(env.n_actions)))
    RL = DeepQNetwork(n_actions=env.n_actions,
                      n_features=env.n_features,
                      output_graph=True)
    # start visualizing the environment
    # env.after(100, update)
    # env.after(100, update_sarsa)
    env.after(100, update_DQN)
    # env.after(100, play_once)
    env.mainloop()
    RL.plot_cost()
def train_hdqnm(seed, file):
    # maze game
    MAZE_H = 3
    MAZE_W = 3
    hell_coord = [0, 2]
    door_coord = [2, 2]
    oval_coord = [1, 1]

    np.random.seed(seed)
    tf.set_random_seed(seed)

    # maze game
    env = Maze(MAZE_H, MAZE_W, hell_coord, door_coord, oval_coord)
    n_goals = 3
    max_episode = 10000
    controller_start = 200
    meta_controller_start = 10000

    controller = DeepQNetwork(env.n_actions, env.n_features + 1, 'controller',
                              optimizer='rmsprop',
                              momentum=0.9,
                              learning_rate=1e-3,
                              opt_decay=0.99,
                              reward_decay=0.99,
                              e_greedy=0,
                              e_greedy_max=0.99,
                              e_greedy_increment=1e-4,
                              e_greedy_iter=5e3,
                              replace_target_iter=200,
                              memory_size=5000,
                              output_graph=False,
                              prioritized_replay=False,
                              prioritized_replay_alpha=0.6,
                              prioritized_replay_beta0=0.4,
                              prioritized_replay_beta_iters=1e5,
                              prioritized_replay_eps=1e-6)

    meta_controller = DeepQNetwork(n_goals, env.n_features, 'meta_controller',
                                   optimizer='rmsprop',
                                   momentum=0.9,
                                   learning_rate=1e-3,
                                   opt_decay=0.99,
                                   reward_decay=0.99,
                                   e_greedy=0,
                                   e_greedy_max=0.99,
                                   e_greedy_increment=1e-4,
                                   e_greedy_iter=1e3,
                                   replace_target_iter=200,
                                   memory_size=500,
                                   output_graph=False,
                                   prioritized_replay=True,
                                   prioritized_replay_alpha=0.6,
                                   prioritized_replay_beta0=0.4,
                                   prioritized_replay_beta_iters=1e3,
                                   prioritized_replay_eps=1e-6)

    def play_maze():
        def goal_reached(g, s):
            return (s[2 * g + 1] == 0) and (s[2 * g + 2] == 0)

        s = env.reset()
        g = meta_controller.choose_action(s, test=True)
        score = 0
        while True:
            env.render()
            a = controller.choose_action(np.hstack((s, g)), test=True)
            s_, r, done = env.step(a)
            score += r
            if done:
                break
            s = s_
            if goal_reached(g, s):
                g = meta_controller.choose_action(s, test=True)
        return score

    def run_maze(max_episode):
        def goal_reached(g, s):
            return (s[2 * g + 1] == 0) and (s[2 * g + 2] == 0)

        def goal_distance(g, s):
            return np.sqrt(s[2 * g + 1] ** 2 + s[2 * g + 2] ** 2)

        def reward(g, s_, done):
            # return 1 + 1 / episode_step if goal_reached(g, s_) else 0
            # if done and not goal_reached(g, s_): return -1
            return 1 if goal_reached(g, s_) else -1
            # return -goal_distance(g, s_)
            # if goal_reached(g, s_):
            #     return 1
            # else:
            #     return -1
            # for i in range(n_goals):
            #     if goal_reached(i, s_):
            #         if i == g: return 1
            #         else: return -1
            # return 0

        step = 0
        score_list = []
        avescore_list = []
        testscore_list = []
        flag = [False, False, False, False]
        for episode in range(max_episode):
            # initial observation
            s = env.reset()
            score = 0
            g = meta_controller.choose_action(s)
            Done = False
            while True:
                F = 0
                s0 = s
                episode_step = 1
                while True:
                    env.render()
                    a = controller.choose_action(np.hstack((s, g)))
                    s_, f, done = env.step(a)
                    score += f
                    r = reward(g, s_, done)
                    controller.memory.add(np.hstack((s, g)), a, r, np.hstack((s_, g)), done)
                    if (step > controller_start) and (step % 5 == 0):
                        controller.learn()
                    # if (step > meta_controller_start) and (step % 5 == 0):
                    #     meta_controller.learn()
                    if step == meta_controller_start:
                        print('\nmeta controller start learn~~~~~~~~~~~~~~~~~~~~')
                    if step == controller_start:
                        print('\ncontroller start learn~~~~~~~~~~~~~~~~~~~~~~~~~~')
                    F = F + f
                    s = s_
                    step = step + 1
                    if done:
                        Done = True
                        break
                    if goal_reached(g, s):
                        break
                    episode_step = episode_step + 1
                if goal_reached(g, s):
                    meta_controller.memory.add(s0, g, F, s, done)
                    if step > meta_controller_start:
                        meta_controller.learn()
                if Done:
                    break
                g = meta_controller.choose_action(s)
            # print(step)
            score_list.append(score)

            if episode > 0 and episode % 50 == 0:
                avescore = np.average(np.array(score_list[-50:]))
                avescore_list.append(avescore)
                print("\nepisode %d : average score = %f" % (episode, avescore))
                testscore = play_maze()
                testscore_list.append(testscore)
                print("episode %d : test score = %f\n" % (episode, testscore))
                if avescore > 2.5 and not flag[0]:
                    flag[0] = True
                    with open(file, 'a+') as f:
                        f.write('train average score achieves 2.5 (' + str(avescore) + ')\n')
                    # print('game over')
                    # env.destroy()
                    # return
                elif avescore > 0.5 and not flag[1]:
                    flag[1] = True
                    with open(file, 'a+') as f:
                        f.write('train average score achieves 0.5 (' + str(avescore) + ')\n')
                if testscore > 2.5 and not flag[2]:
                    flag[2] = True
                    with open(file, 'a+') as f:
                        f.write('test score achieves 2.5 (' + str(testscore) + ')\n')
                elif testscore > 0.5 and not flag[3]:
                    flag[3] = True
                    with open(file, 'a+') as f:
                        f.write('test score achieves 0.5 (' + str(testscore) + ')\n')

            if episode > 0 and episode % 10 == 0:
                if step > controller_start:
                    print('controller loss:', np.mean(controller.cost_his[np.max(
                        [0, controller.learn_step_counter - 100]):controller.learn_step_counter]))
                if step > meta_controller_start:
                    print('meta controller loss:', np.mean(meta_controller.cost_his[np.max(
                        [0, meta_controller.learn_step_counter - 100]):meta_controller.learn_step_counter]))

        # end of game
        print('game over')
        env.destroy()
        plt.plot(range(len(avescore_list)), avescore_list)
        plt.show()
        np.savetxt("E:/my_py_project/rl/log/maze33/hdqnm/SEED_%d_avescore_log.txt" % (seed), avescore_list)
        np.savetxt("E:/my_py_project/rl/log/maze33/hdqnm/SEED_%d_testscore_log.txt" % (seed), testscore_list)

    env.after(100, run_maze, max_episode)
    env.mainloop()
    controller.plot_cost()
    meta_controller.plot_cost()
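# A hypothetical driver for train_hdqnm() above; the seed value and log-file path are
# illustrative assumptions, not taken from the original code. The function itself relies
# on numpy as np, tensorflow as tf, matplotlib.pyplot as plt, Maze, and DeepQNetwork
# being imported elsewhere in the file.
if __name__ == '__main__':
    train_hdqnm(seed=1, file='hdqnm_log.txt')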
import gym
from DQN_modified import DeepQNetwork
import matplotlib.pyplot as plt
import numpy as np

env = gym.make('CartPole-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(n_actions=env.action_space.n,
                  n_features=env.observation_space.shape[0],
                  learning_rate=0.01,
                  e_greedy=0.9,
                  replace_target_iter=100,
                  memory_size=1000,
                  )

total_steps = 0
reward_c = []
show = []
running_reward = 0

for i_episode in range(1000):
    t = 0
    observation = env.reset()
    ep_r = 0
    while True:
        # env.render()
        action = RL.choose_action(observation)
# action_space = [[0.0, 0.8], [1.0, 0.0]]
# action_space = [[0.0, 1.0], [1.0, 0.8]]
# action_space = [[1.0, 1.0], [1.0, 0.0], [0.0, 0.0], [0.0, 1.0]]
# action_space = [[0.0, 0.8], [0.0, 1.0]]
# print(action_space)
# print(action_space[:4])
# print(action_space[5:])
# print(action_space[:-6])
# print(action_space[-5:])

envSeqDec = ChallengeProveEnvironment()
action_space = getActionSpace(envSeqDec)  # reduce the action space
print('actionSpace::::::::::::', action_space)

RL = DeepQNetwork(len(action_space), 6,
                  learning_rate=0.0001,
                  reward_decay=0.9,
                  e_greedy=0.9,
                  replace_target_iter=20,  # 200
                  memory_size=100,  # 2000
                  batch_size=16,
                  # output_graph=True
                  )
# print(RL)
# print('\n'.join(['%s:%s' % item for item in RL.__dict__.items()]))

rewards = run_maze()
print('Best Reward:', np.max(rewards))
x = list(range(len(rewards)))
plt.plot(x, rewards)
plt.show()
# env.mainloop()
# RL.plot_cost()
def train_dqn(seed, file):
    np.random.seed(seed)
    tf.set_random_seed(seed)

    # dsdp game
    env = dsdp()
    max_episode = 20000
    dqn_start = 5000

    dqn = DeepQNetwork(env.n_actions, env.n_features, 'dqn',
                       optimizer='rmsprop',
                       momentum=0.9,
                       learning_rate=0.00025,
                       opt_decay=0.99,
                       reward_decay=0.99,
                       e_greedy=0,
                       e_greedy_max=0.99,
                       e_greedy_increment=1e-4,
                       e_greedy_iter=5e3,
                       replace_target_iter=200,
                       memory_size=10000,
                       output_graph=False,
                       prioritized_replay=True,
                       prioritized_replay_alpha=0.6,
                       prioritized_replay_beta0=0.4,
                       prioritized_replay_beta_iters=1e5,
                       prioritized_replay_eps=1e-6)

    def play_maze():
        # def goal_reached(g, s):
        #     if (g == 0):
        #         return (s[0] == 0)
        #     else:
        #         return ((s[2 * g - 1] == 0) and (s[2 * g] == 0))
        def goal_reached(g, s):
            return (s[1] == g)

        s = env.reset()
        score = 0
        while True:
            env.render()
            a = dqn.choose_action(s, test=True)
            s_, r, done = env.step(a)
            score += r
            if done:
                break
            s = s_
        return score

    def run_maze(max_episode):
        step = 0
        score_list = []
        avescore_list = []
        testscore_list = []
        flag = [False, False, False, False]
        for episode in range(max_episode):
            # initial observation
            s = env.reset()
            score = 0
            while True:
                env.render()
                a = dqn.choose_action(s)
                s_, r, done = env.step(a)
                score += r
                dqn.memory.add(s, a, r, s_, done)
                if (step > dqn_start) and (step % 5 == 0):
                    dqn.learn()
                if step == dqn_start:
                    print('\ndqn start learn~~~~~~~~~~~~~~~~~~~~')
                s = s_
                step = step + 1
                if done:
                    break
            score_list.append(score)

            if episode > 0 and episode % 50 == 0:
                # average score
                avescore = np.average(np.array(score_list[-50:]))
                avescore_list.append(avescore)
                print("\nepisode %d : average score = %f" % (episode, avescore))
                # test score
                testscore = 0
                for i in range(5):
                    testscore += play_maze()
                testscore /= 5
                testscore_list.append(testscore)
                print("episode %d : test score = %f\n" % (episode, testscore))
                # logs
                if avescore > 0.1 and not flag[0]:
                    flag[0] = True
                    with open(file, 'a+') as f:
                        f.write('train average score achieves 0.1 (' + str(avescore) + ')\n')
                    # print('game over')
                    # env.destroy()
                    # return
                elif avescore > 0.02 and not flag[1]:
                    flag[1] = True
                    with open(file, 'a+') as f:
                        f.write('train average score achieves 0.02 (' + str(avescore) + ')\n')
                if testscore > 0.1 and not flag[2]:
                    flag[2] = True
                    with open(file, 'a+') as f:
                        f.write('test score achieves 0.1 (' + str(testscore) + ')\n')
                elif testscore > 0.02 and not flag[3]:
                    flag[3] = True
                    with open(file, 'a+') as f:
                        f.write('test score achieves 0.02 (' + str(testscore) + ')\n')

            # loss
            if episode > 0 and episode % 10 == 0:
                if step > dqn_start:
                    print('dqn loss:', np.mean(
                        dqn.cost_his[np.max([0, dqn.learn_step_counter - 100]):dqn.learn_step_counter]))

        # end of game
        print('game over')
        env.destroy()
        plt.plot(range(len(avescore_list)), avescore_list)
        plt.show()
        np.savetxt("E:/my_py_project/rl/log/dsdp/dqn/SEED_%d_avescore_log.txt" % (seed), avescore_list)
        np.savetxt("E:/my_py_project/rl/log/dsdp/dqn/SEED_%d_testscore_log.txt" % (seed), testscore_list)

    env.after(100, run_maze, max_episode)
    env.mainloop()
    dqn.plot_cost()
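# A hypothetical multi-seed driver for train_dqn() above, sketching how the per-seed
# log files referenced inside the function might be produced; the seed list and the
# log-file naming are assumptions, not part of the original code.
if __name__ == '__main__':
    for seed in (1, 2, 3):
        train_dqn(seed, file='dqn_seed_%d_log.txt' % seed)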
import gym
from DQN_modified import DeepQNetwork

env = gym.make('MountainCar-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(
    n_actions=3,
    n_features=2,
    learning_rate=0.001,
    e_greedy=0.9,
    replace_target_iter=300,
    memory_size=3000,
    e_greedy_increment=0.0002,
)

total_steps = 0

for i_episode in range(10):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()
        action = RL.choose_action(observation)
            step += 1

    # end of game
    print('game over')
    # print(RL.memory)
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()

    import matplotlib.pyplot as plt
    plt.plot(np.arange(len(episode_reward)), episode_reward)
    plt.ylabel('episode_reward')
    plt.xlabel('run steps')
    plt.show()
def translate_int_action(int_action):
    # map the discrete action index to a 2-element control vector
    act = np.zeros(2)
    if int_action == 1:
        act[0] = 1
    if int_action == 2:
        act[1] = 1
    return act


env = Dogfight()
RL = DeepQNetwork(n_actions=3,
                  n_features=14,
                  learning_rate=0.01,
                  reward_decay=0.9,
                  e_greedy=0.9,
                  replace_target_iter=200,
                  memory_size=1048576,
                  batch_size=50 * 700,
                  training=True,
                  import_file='saved/trained_dqn')

step = 0
score_history = []
for episode in range(600):
    blue_state, red_state = env.reset()
    score = 0
    # Main game loop
    while True:
        blue_action = RL.choose_action(blue_state)
        red_action = 0  # RL.choose_action(red_state)
from DQN_modified import DeepQNetwork

env = Crypto(name='BTC-USD',
             data_path='./test.csv',
             start_cash=1000,
             fee=0.001,
             drawdown_call=10,
             fixed_stake=0.001,
             period=180)

RL = DeepQNetwork(
    env.n_actions,
    env.n_features,
    learning_rate=0.01,
    reward_decay=0.9,
    e_greedy=0.9,
    e_greedy_increment=0.00001,
    replace_target_iter=6400,
    memory_size=100000,
    # output_graph=True
)

total_steps = 0
total_length = env.length

for i_episode in range(total_length):
    observation = env.reset()
    ep_r = 0
    while True:
            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,   # replace target_net parameters every 200 steps
        memory_size=2000,          # replay memory capacity
        # output_graph=True        # whether to write a TensorBoard file
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()