def main():
    global env, RL
    env = Maze('./env/maps/map3.json', full_observation=True)
    RL = DeepQNetwork(
        n_actions=4,
        n_features=env.height * env.width,
        restore_path=None,
        # restore_path=base_path + 'model_dqn.ckpt',
        learning_rate=0.00001,
        reward_decay=0.9,
        e_greedy=0.95,
        replace_target_iter=4e4,
        batch_size=64,
        e_greedy_init=0,
        # e_greedy_increment=None,
        e_greedy_increment=1e-3,
        output_graph=False,
    )
    env.after(100, run_maze)  # call run_maze() once after 100 ms
    env.mainloop()
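# run_maze() is scheduled above but not defined in this fragment. A minimal
# sketch of what such a DQN training loop could look like, assuming
# DeepQNetwork exposes choose_action / store_transition / learn (hypothetical
# usage; the real run_maze may differ):
def run_maze_sketch():
    step = 0
    for episode in range(300):
        observation = env.reset()
        while True:
            env.render()
            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            # start learning once some experience has accumulated
            if step > 200 and step % 5 == 0:
                RL.learn()
            observation = observation_
            step += 1
            if done:
                break
    print('training finished')
    env.destroy()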
                total_step)
        except Exception as e:
            print(e)


if __name__ == "__main__":
    GLOBAL_NET_SCOPE = 'Global_Net'
    N_S = MazeEnv.state_space_dim
    N_A = MazeEnv.action_dim

    SESS = tf.Session()
    with tf.device("/cpu:0"):
        # OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
        # OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
        global_maze = Maze.build(bounds=(80, 80))
        # sess, name, N_S, N_A, globalAC, maze=None
        GLOBAL_AC = A3CNet(SESS, GLOBAL_NET_SCOPE, N_S, N_A)  # we only need its params
        workers = []
        # create one worker per thread
        for i in range(N_WORKERS):
            i_name = 'W_%i' % i  # worker name
            workers.append(
                Worker(SESS, i_name, N_S, N_A, GLOBAL_AC, global_maze))

    COORD = tf.train.Coordinator()
    SESS.run(tf.global_variables_initializer())

    worker_threads = []
class MazeEnv(object):
    """Simulated maze environment."""

    action_dim = 4
    state_space_dim = 2

    def __init__(self, log_name='MazeEnv', maze=None):
        if maze is None:
            self.maze = Maze.build(bounds=(10, 10), block_cnt=20)
        else:
            self.maze = Maze(
                start=(maze.x, maze.y),
                bounds=(maze.max_x, maze.max_y),
                door=maze.door,
                blocks=maze.blocks)
        # if MazeEnv.logger is None:
        #     MazeEnv.logger = Logger("MazeEnv")
        # self.logger = Logger(log_name, show_in_console=False)
        self.viewer = None
        # self.queue = Queue()

    def reset(self):
        x = random.randint(0, self.maze.max_x - 1)
        y = random.randint(0, self.maze.max_y - 1)
        # x, y = 0, 0
        self.maze.set_start((x, y))
        # self.maze = Maze.build(bounds=(20, 20), block_cnt=100)
        if self.viewer is not None:
            self.viewer.maze = self.maze
        return self.get_state()

    # def clear_queue(self):
    #     while not self.queue.empty():
    #         self.queue.get()

    def step(self, a):
        """Apply action a and return the new state (s), reward, done.

        a: 0=right, 1=left, 2=down, 3=up
        """
        # s = self.get_state()
        succ = False
        if a == 3:
            succ = self.maze.move_up()
        if a == 2:
            succ = self.maze.move_down()
        if a == 1:
            succ = self.maze.move_left()
        if a == 0:
            succ = self.maze.move_right()
        # No per-step reward until the door is reached; the intent was a -1
        # per-step cost so the policy escapes in the fewest steps, but the
        # code as written gives 0.
        r = 0
        if not succ:
            # invalid moves such as walking into a wall are not penalized here
            r = 0
        done = False
        if self.maze.done():
            done = True
            r = 10
        # self.logger.debug([s, a, self.get_state(), r, done])
        # if self.viewer is not None:
        #     self.viewer.maze.set_start(start=(self.maze.x, self.maze.y))
        return self.get_state(), r, done

    def render(self):
        if self.viewer is None:
            self.viewer = MazeViewer(self.maze)
        self.viewer.render()

    def get_state(self):
        return np.hstack([self.maze.x, self.maze.y])
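# A minimal smoke test for MazeEnv under a random policy (a sketch; assumes
# the same module-level imports of random / Maze / MazeViewer as above):
if __name__ == '__main__':
    env = MazeEnv()
    s = env.reset()
    done = False
    while not done:
        a = random.randint(0, MazeEnv.action_dim - 1)  # random action
        s, r, done = env.step(a)
        env.render()
    print('escaped with final reward', r)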
def main():
    global env
    env = Maze('./env/maps/map3.json')
    env.after(100, run_maze)  # call run_maze() once after 100 ms
    env.mainloop()  # mainloop() runs the Tk application
        arrow = '←'
    if action == 3:
        arrow = '→'
    return arrow


# phi(): observation preprocessing hook. Here it is an identity function.
def phi(observation):
    return observation


if __name__ == "__main__":
    # get the maze environment
    env = Maze()
    # get the DeepQNetwork agent
    RL = DeepQNetwork(env.n_actions,
                      env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      e_greedy_increment=0.01,
                      output_graph=True,
                      )
    # measure running time
    start_time = time.time()
    run_maze()
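# If observations were raw pixel frames, phi() is where preprocessing would
# live. A hedged sketch of one common choice (grayscale + normalization),
# assuming the observation is an HxWx3 uint8 numpy array; this is an
# illustration, not part of the original pipeline:
def phi_grayscale(observation):
    gray = observation.mean(axis=-1)  # collapse the RGB channels
    return (gray / 255.0).astype(np.float32)  # scale to [0, 1]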
def main():
    global env, RL, env_model
    # if_dyna = True
    # env = Maze('./env/maps/map2.json')
    # if if_dyna:
    #     # ---------- Dyna Q ---------- #
    #     # from brain.dyna_Q import QLearningTable, EnvModel
    #     RL = QLearningTable(actions=list(range(env.n_actions)))
    #     env_model = EnvModel(actions=list(range(env.n_actions)))
    #     env.after(0, update_dyna_q)  # call update_dyna_q() once after 0 ms
    # else:
    #     # -------- Q Learning -------- #
    #     # from brain.Q_learning import QLearningTable
    #     RL = QLearningTable(actions=list(range(env.n_actions)))
    #     env.after(0, update_q)  # call update_q() once after 0 ms

    time_cmp = []

    # -------- Q Learning -------- #
    # from brain.Q_learning import QLearningTable
    start = time.time()
    env = Maze('./env/maps/map2.json')
    RL = QLearningTable(actions=list(range(env.n_actions)))
    # pass the function itself: update_q() would run immediately and
    # schedule its None return value instead
    env.after(0, update_q)
    env.mainloop()
    sum_time = time.time() - start
    time_cmp.append(sum_time)

    # ---------- Dyna Q ---------- #
    # from brain.dyna_Q import QLearningTable, EnvModel
    for n in [5, 10, 25, 50]:
        start = time.time()
        env = Maze('./env/maps/map2.json')
        RL = QLearningTable(actions=list(range(env.n_actions)))
        env_model = EnvModel(actions=list(range(env.n_actions)))
        print('Dyna-{}'.format(n))
        env.after(0, update_dyna_q, n)  # n is forwarded to update_dyna_q()
        env.mainloop()  # mainloop() runs the Tk application
        sum_time = time.time() - start
        time_cmp.append(sum_time)

    # This part must run after env.mainloop() has exited.
    # Plot all curves together.
    all_aver_steps = [np.load('./logs/q_learning/q_learning.npy').tolist()]
    for n in [5, 10, 25, 50]:
        all_aver_steps.append(
            np.load('./logs/dyna_q/dyna_q_{}.npy'.format(n)).tolist())
    plot_multi_lines(
        all_aver_steps,
        all_labels=['q_learning', 'dyna_5', 'dyna_10', 'dyna_25', 'dyna_50'],
        save_path='./logs/cmp_all.png')

    # Plot only the Dyna-Q curves.
    all_aver_steps = []
    for n in [5, 10, 25, 50]:
        all_aver_steps.append(
            np.load('./logs/dyna_q/dyna_q_{}.npy'.format(n))[0:100].tolist())
    plot_multi_lines(all_aver_steps,
                     all_labels=['dyna_5', 'dyna_10', 'dyna_25', 'dyna_50'],
                     save_path='./logs/cmp_all_dyna_Q.png')

    print(time_cmp)
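# update_dyna_q(n) is scheduled above but not shown in this fragment. A
# minimal sketch of the Dyna-Q loop it presumably implements, assuming
# EnvModel exposes store_transition / sample_s_a / get_r_s_ (hypothetical
# method names; the real implementation may differ):
def update_dyna_q_sketch(n):
    for episode in range(100):
        s = env.reset()
        while True:
            env.render()
            a = RL.choose_action(str(s))
            s_, r, done = env.step(a)
            RL.learn(str(s), a, r, str(s_))  # direct RL from real experience
            env_model.store_transition(str(s), a, r, s_)
            for _ in range(n):  # n planning steps from the learned model
                ms, ma = env_model.sample_s_a()
                mr, ms_ = env_model.get_r_s_(ms, ma)
                RL.learn(ms, ma, mr, str(ms_))
            s = s_
            if done:
                break
    env.destroy()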
    #         action = rl.choose_action(str(observation))
    #         observation_, reward, done = env.step(action)
    #         rl.learn(str(observation), action, reward, str(observation_))
    #         observation = observation_
    #         if done:
    #             break

    # on-policy SARSA: the action actually taken in s_ is used for the update
    rl = SarsaTable(actions=list(range(env.n_actions)))
    for episode in range(100):
        observation = env.reset()
        action = rl.choose_action(str(observation))
        while True:
            observation_, reward, done = env.step(action)
            action_ = rl.choose_action(str(observation_))
            rl.learn(str(observation), action, reward, str(observation_),
                     action_)
            observation = observation_
            action = action_
            if done:
                break
    print('game over')
    env.destroy()


if __name__ == '__main__':
    env = Maze()
    env.after(1000, main)
    env.mainloop()
                total_step)
        except Exception as e:
            print(e)


if __name__ == "__main__":
    GLOBAL_NET_SCOPE = 'Global_Net'
    N_S = MazeEnv.state_space_dim
    N_A = MazeEnv.action_dim

    SESS = tf.Session()
    with tf.device("/cpu:0"):
        # OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA')
        # OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC')
        global_maze = Maze.build(bounds=(30, 30), block_cnt=200)
        # sess, name, N_S, N_A, globalAC, maze=None
        GLOBAL_AC = A3CNet(SESS, GLOBAL_NET_SCOPE, N_S, N_A)  # we only need its params
        workers = []
        # create one worker per thread
        for i in range(N_WORKERS):
            i_name = 'W_%i' % i  # worker name
            workers.append(
                Worker(SESS, i_name, N_S, N_A, GLOBAL_AC, global_maze))

    COORD = tf.train.Coordinator()
    SESS.run(tf.global_variables_initializer())

    worker_threads = []
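    # A sketch of the usual A3C continuation from this point: launch each
    # worker in its own thread and join them via the coordinator. This assumes
    # Worker exposes a work() loop, which is not shown in this fragment.
    import threading
    for worker in workers:
        t = threading.Thread(target=worker.work)
        t.start()
        worker_threads.append(t)
    COORD.join(worker_threads)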
class MazeEnv(object):
    """Simulated maze environment (snake variant)."""

    action_dim = 4
    state_space_dim = 5

    def __init__(self, log_name='MazeEnv', maze=None):
        if maze is None:
            self.maze = Maze.build(bounds=(50, 50))
        else:
            self.maze = Maze(bounds=(maze.max_x, maze.max_y),
                             target=maze.target)
        # if MazeEnv.logger is None:
        #     MazeEnv.logger = Logger("MazeEnv")
        # self.logger = Logger(log_name, show_in_console=False)
        self.viewer = None
        # self.queue = Queue()

    def reset(self):
        # x, y are sampled but unused: Maze.build() re-randomizes the maze
        x = random.randint(0, self.maze.max_x - 1)
        y = random.randint(0, self.maze.max_y - 1)
        self.maze = Maze.build(bounds=(self.maze.max_x, self.maze.max_y))
        if self.viewer is not None:
            self.viewer.set_maze(self.maze)
        return self.get_state()

    def step(self, a):
        """Apply action a and return the new state (s), reward, done.

        a: 0=right, 1=left, 2=down, 3=up
        """
        # s = self.get_state()
        if a == 3:
            self.maze.move_up()
        if a == 2:
            self.maze.move_down()
        if a == 1:
            self.maze.move_left()
        if a == 0:
            self.maze.move_right()
        # reward = length gained this step, minus a per-step cost of 1
        r = self.maze.snakes[0].delta_len() - 1
        done = False
        if self.maze.done():
            done = True
            r = -5  # dying ends the episode with a penalty
        return self.get_state(), r, done

    def render(self):
        if self.viewer is None:
            self.viewer = MazeViewer(self.maze)
        self.viewer.render()

    def get_state(self):
        return np.hstack([
            self.maze.target[0], self.maze.target[1], self.maze.snakes[0].x,
            self.maze.snakes[0].y, self.maze.snakes[0].length()
        ])
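# Quick interactive check of the snake environment under a random policy,
# accumulating the episode return (a sketch; assumes the module-level imports
# of random / np / Maze / MazeViewer as above):
if __name__ == '__main__':
    env = MazeEnv()
    s, total_r = env.reset(), 0
    for _ in range(200):
        s, r, done = env.step(random.randint(0, MazeEnv.action_dim - 1))
        total_r += r
        env.render()
        if done:
            break
    print('episode return:', total_r)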
    def _calc_block_size(self):
        w, h = self.get_size()
        area_w = w * self._get_occupy()
        area_h = h * self._get_occupy()
        block_w = area_w / self.maze.max_x
        block_h = area_h / self.maze.max_y
        return (block_w, block_h)


def viewer_run():
    viewer = MazeViewer(Maze())
    while True:
        viewer.render()


if __name__ == '__main__':
    import time
    import random
    import logging

    maze = Maze.build((10, 10))
    viewer = MazeViewer(maze)
    while True:
        dx, dy = random.randint(-1, 1), random.randint(-1, 1)
        print(dx, dy)
        maze.move(dx, dy)
        viewer.render()
        time.sleep(0.1)
def main():
    global env
    env = Maze('./env/maps/map1.json', full_observation=True)
    env.after(100, run_maze)  # call run_maze() once after 100 ms
    env.mainloop()  # mainloop() runs the Tk application