parser.add_argument("-gamma", "--gamma", default=0.95, help="discount factor") parser.add_argument("-alpha", "--alpha", default=0.1, help="learning rate (step size)") parser.add_argument("-plan_step", "--plan_step", default=5, help="planning steps over the learned model") args = parser.parse_args() # Create an environment env = Maze() plan_steps = [0, 5, 50] steps = np.zeros((len(plan_steps), args.episodes)) for run in range(args.runs): for index, plan_step in zip(range(len(plan_steps)), plan_steps): start_time = time.time() args.planning_steps = plan_step # initialize Q table q_value = np.zeros(env.q_size) # generate Dyna-Q model model = InternalModel()
def main():
    number_of_turns = 0  # counts the number of steps before the game ends
    catchCount = 0       # count of game ends
    env = Maze(FILE_NAME)
    myCat = Brain('Cat', env.cat.pos, env.actions)
    myMouse = Brain('Mouse', env.mouse.pos, env.actions)
    cheesePos = env.cheese.pos
    board = env.mazeList

    ## DEBUGGING
    debug = False             # step-by-step toggle
    env.renderWindow = False  # start without graphics being rendered

    while True:
        if debug:
            print('\nCLICK to start loop.')
            env.win.getMouse()
            print('==At start of loop, cat and mouse information:==')
            myCat.printInfo()
            myMouse.printInfo()

        if debug:
            print('\nCLICK to let mouse choose action.')
            env.win.getMouse()
            # print('Calling mouse.chooseRandom with catpos mousepos cheesepos:', myCat.pos, myMouse.pos, cheesePos)

        mouseAction = myMouse.chooseAction(board, myCat.pos, myMouse.pos, cheesePos)
        mouseImmediateReward = env.moveMouse(mouseAction)

        if debug:
            print('immediate reward:', mouseImmediateReward)
            print('myMouse.q_table:', myMouse.q_table)
            print('\nCLICK to let cat choose action.')
            env.win.getMouse()
            # print('Calling cat.chooseRandom with catpos mousepos cheesepos:', myCat.pos, myMouse.pos, cheesePos)

        catAction = myCat.chooseAction(board, myCat.pos, myMouse.pos, cheesePos)
        catImmediateReward = env.moveCat(catAction)

        if debug:
            print('catAction:', catAction)
            print('immediate reward:', catImmediateReward)
            print('myCat.q_table:', myCat.q_table)
            print('\nCLICK to get feedback from environment.')
            env.win.getMouse()

        # get feedback from the environment
        catPos, catReward, mousePos, mouseReward, done = env.turnEnd()

        # add goal rewards, if any
        catImmediateReward += catReward
        mouseImmediateReward += mouseReward

        if debug:
            print('catPos:', catPos, 'catImmediateReward:', catImmediateReward,
                  'mousePos:', mousePos, 'mouseImmediateReward:', mouseImmediateReward,
                  'done:', done)
            print('catReward:', catReward, 'mouseReward:', mouseReward)
            print('\nCLICK to update agent Brain with positions.')
            env.win.getMouse()

        # update the agents' brains to reflect the board positions after the move
        myMouse.updateBrain(catPos, catReward, mousePos, mouseReward)
        myCat.updateBrain(catPos, catReward, mousePos, mouseReward)
        myCat.printInfo()
        myMouse.printInfo()

        if debug:
            print('\nCLICK to start learnLast step for both agents.')
            env.win.getMouse()

        # immediate learning of the step just taken
        myMouse.learnLast(mouseImmediateReward)
        myCat.learnLast(catImmediateReward)
        myCat.printInfo()
        myMouse.printInfo()

        if debug:
            print('\nCLICK to continue.')

        # if something got caught, run end-of-episode learning for both agents
        if done:
            # time.sleep(1)
            catchCount += 1
            print('Hit something')
            if debug:
                print('mouse q-table before learnAll')
                print(myMouse.q_table)
                print('mouse history before learnAll')
                print(myMouse.history)
            myMouse.learnAll(mouseReward)
            myCat.learnAll(catReward)
            # restart() is used so random spawn positions can be programmed in
            myCat.pos, myMouse.pos, cheesePos = env.restart()
            # env.win.getMouse()

        number_of_turns += 1
        # if number_of_turns == 100:
        #     break

        if catchCount % 1000 == 0:
            env.renderWindow = True
        if catchCount % 1001 == 2:
            env.renderWindow = False
        if catchCount % 100 == 0:
            saveAgent(myCat, catchCount)
            saveAgent(myMouse, catchCount)
        if catchCount == 1:
            break
                ms, ma = env_model.sample_s_a()  # ms here is a state string
                mr, ms_ = env_model.get_r_s_(ms, ma)
                RL.learn(ms, ma, mr, str(ms_))
                # print(env_model.database)
                # print(RL.q_table)

            s = s_
            s2 = s2_

            if done:
                s = env.reset()
                break
            if done2:
                s2 = env.reset2()
                break

    # end of game
    print('game over')
    print(RL.q_table)
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    env_model = EnvModel(actions=list(range(env.n_actions)))
    RL = Learning(actions=list(range(env.n_actions)))

    env.after(0, update)
    env.mainloop()
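# --- Hedged sketch (not part of the original file) ---------------------------
# The EnvModel class used above is not shown here. A minimal, dict-backed model
# that supports the two calls the planning loop makes (sample_s_a and get_r_s_)
# could look like the class below; the real EnvModel and its database layout
# may well differ, so treat this only as an interface illustration.
import random

class SimpleEnvModel:
    def __init__(self):
        self.database = {}  # {state_str: {action: (reward, next_state)}}

    def store(self, s, a, r, s_):
        # remember the observed transition for later planning
        self.database.setdefault(s, {})[a] = (r, s_)

    def sample_s_a(self):
        # pick a previously visited state and one of its recorded actions
        s = random.choice(list(self.database))
        a = random.choice(list(self.database[s]))
        return s, a

    def get_r_s_(self, s, a):
        # return the stored (reward, next_state) for that pair
        return self.database[s][a]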
]
'''

Map = \
['#o#*#o#*#',
 'o1o *o^o',
 '# # # #o#',
 'o #####^o',
 '#*#####o#',
 'oo^oo^ o',
 '#oo^#oo0#'
]

human_play = False
have_render = True

env = Maze(Map)
if have_render:
    Image = env.render()

act = [0, 0]
score = [0, 0]


def ch(x):
    # map a key character to an action index
    if x == 'l': return 0
    if x == 'r': return 1
    if x == 'u': return 2
    if x == 'd': return 3
    if x == 'b': return 4
    if x == 's': return 5


def read_action(event):
    if event.keysym == 'Left':
def testLoading(itNumber):
    print('INIT base game..')
    time.sleep(1)
    catchCount = itNumber
    env = Maze(FILE_NAME)
    myCat = Brain('Cat', env.cat.pos, env.actions)
    myMouse = Brain('Mouse', env.mouse.pos, env.actions)
    cheesePos = env.cheese.pos
    board = env.mazeList

    print('loading from file')
    loadAgent(myCat, catchCount)
    loadAgent(myMouse, catchCount)
    time.sleep(1)

    print('showing agent info/q_tables')
    myCat.printInfo()
    myMouse.printInfo()
    time.sleep(1)

    print('testing running of agents from this point..')
    time.sleep(1)

    ## DEBUGGING
    debug = True             # step-by-step toggle
    env.renderWindow = True  # start with graphics being rendered

    while True:
        if debug:
            print('\nCLICK to start loop.')
            env.win.getMouse()
            print('==At start of loop, cat and mouse information:==')
            myCat.printInfo()
            myMouse.printInfo()

        if debug:
            print('\nCLICK to let mouse choose action.')
            env.win.getMouse()
            # print('Calling mouse.chooseRandom with catpos mousepos cheesepos:', myCat.pos, myMouse.pos, cheesePos)

        mouseAction = myMouse.chooseAction(board, myCat.pos, myMouse.pos, cheesePos)
        mouseImmediateReward = env.moveMouse(mouseAction)

        if debug:
            print('immediate reward:', mouseImmediateReward)
            print('myMouse.q_table:', myMouse.q_table)
            print('\nCLICK to let cat choose action.')
            env.win.getMouse()
            # print('Calling cat.chooseRandom with catpos mousepos cheesepos:', myCat.pos, myMouse.pos, cheesePos)

        catAction = myCat.chooseAction(board, myCat.pos, myMouse.pos, cheesePos)
        catImmediateReward = env.moveCat(catAction)

        if debug:
            print('catAction:', catAction)
            print('immediate reward:', catImmediateReward)
            print('myCat.q_table:', myCat.q_table)
            print('\nCLICK to get feedback from environment.')
            env.win.getMouse()

        # get feedback from the environment
        catPos, catReward, mousePos, mouseReward, done = env.turnEnd()

        # add goal rewards, if any
        catImmediateReward += catReward
        mouseImmediateReward += mouseReward

        if debug:
            print('catPos:', catPos, 'catImmediateReward:', catImmediateReward,
                  'mousePos:', mousePos, 'mouseImmediateReward:', mouseImmediateReward,
                  'done:', done)
            print('catReward:', catReward, 'mouseReward:', mouseReward)
            print('\nCLICK to update agent Brain with positions.')
            env.win.getMouse()

        # update the agents' brains to reflect the board positions after the move
        myMouse.updateBrain(catPos, catReward, mousePos, mouseReward)
        myCat.updateBrain(catPos, catReward, mousePos, mouseReward)
        myCat.printInfo()
        myMouse.printInfo()

        if debug:
            print('\nCLICK to start learnLast step for both agents.')
            env.win.getMouse()

        # immediate learning of the step just taken
        myMouse.learnLast(mouseImmediateReward)
        myCat.learnLast(catImmediateReward)
        myCat.printInfo()
        myMouse.printInfo()

        if debug:
            print('\nCLICK to continue.')

        # if something got caught, run end-of-episode learning for both agents
        if done:
            catchCount += 1
            print('Hit something')
            if debug:
                print('mouse q-table before learnAll')
                print(myMouse.q_table)
                print('mouse history before learnAll')
                print(myMouse.history)
            myMouse.learnAll(mouseReward)
            myCat.learnAll(catReward)
            # restart() is used so random spawn positions can be programmed in
            myCat.pos, myMouse.pos, cheesePos = env.restart()

        if catchCount == 500000:
            break
from env import Maze
from q_learning import QLearning


def update():
    for episode in range(20):
        state = env.reset()
        step_count, done = 0, False
        while not done:
            env.render()
            action = agent.choose_action(str(state))
            state_, reward, done = env.step(action)
            step_count += 1
            agent.learn(str(state), action, reward, str(state_))
            state = state_
        print('Round over at: {0} round, Total steps: {1} steps'.format(episode, step_count))


if __name__ == '__main__':
    env = Maze()
    agent = QLearning(actions=list(range(env.n_actions)))
    # pass the callback itself; calling update() here would run it immediately
    # and hand Tkinter its return value (None) instead of a callable
    env.after(100, update)
    env.mainloop()

    print('\nQ Table')
    print(agent.q_table)
    agent.q_table.to_csv('Q_Table.csv')
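# --- Hedged sketch (not part of the original file) ---------------------------
# The QLearning class is imported from q_learning.py and not shown above. The
# update it is expected to apply on each learn(s, a, r, s_) call is the standard
# one-step Q-learning rule; a dict-based version is sketched below with assumed
# hyper-parameter names (lr, gamma), purely for illustration.
from collections import defaultdict

def q_update(q, s, a, r, s_, actions, lr=0.01, gamma=0.9, terminal=False):
    """One-step Q-learning backup: Q(s,a) += lr * (target - Q(s,a))."""
    # bootstrap from the best next action unless the episode has ended
    target = r if terminal else r + gamma * max(q[(s_, b)] for b in actions)
    q[(s, a)] += lr * (target - q[(s, a)])

# usage: q = defaultdict(float); q_update(q, '[5, 5]', 1, 0.0, '[5, 45]', range(4))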
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break the while loop when this episode ends
            # if done:
            #     break

            step += 1
            time.sleep(60)


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
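# --- Hedged sketch (not part of the original file) ---------------------------
# The top of run_maze() is not included above; only the learning condition and
# the observation swap survive. In the DQN maze tutorials this file appears to
# follow, each step of an episode usually looks like the loop below. The
# store_transition name and the reset/step signatures are assumptions here, not
# confirmed parts of this repo's DeepQNetwork or Maze classes.
def run_episode(env, agent):
    observation = env.reset()
    done, step = False, 0
    while not done:
        env.render()
        action = agent.choose_action(observation)
        observation_, reward, done = env.step(action)
        agent.store_transition(observation, action, reward, observation_)
        if (step > 200) and (step % 5 == 0):
            agent.learn()  # start learning once the replay memory has warmed up
        observation = observation_
        step += 1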
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = Maze(Map)
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            t = 0
            while True:
                if not ROLLING_EVENT.is_set():  # while the global PPO is updating
                    ROLLING_EVENT.wait()        # wait until PPO has been updated
                    # clear the history buffer; collect data with the new policy
                    buffer_s, buffer_a, buffer_r = [], [], []
                a = self.ppo.choose_action(s)
                baseline_a = base.choose_action(self.env, 1)
                s_, r, done = self.env.step({(0, a), (1, U.ch(baseline_a))})
                r = r[0]
                buffer_s.append(s.flatten())
                buffer_a.append(a)
                buffer_r.append(r)
                s = s_
                ep_r += r
                t += 1
                # print('step : %d, reward : %d, done : %d' % (t, r, done))

                # count toward the minimum batch size; no need to wait for other workers
                GLOBAL_UPDATE_COUNTER += 1
                if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
                    print(GLOBAL_EP)
                    if done:
                        v_s_ = 0  # terminal state has zero value
                    else:
                        v_s_ = self.ppo.get_v(s_)

                    # compute discounted returns
                    discounted_r = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs = np.vstack(buffer_s)
                    ba = np.vstack(buffer_a)
                    br = np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put the collected batch in the queue

                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or GLOBAL_EP >= EP_MAX:
                        print('update')
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()     # let the global PPO update

                    if GLOBAL_EP >= EP_MAX:    # stop training
                        COORD.request_stop()
                        break

                if done:
                    # record the running reward so it can be plotted later
                    if len(GLOBAL_RUNNING_R) == 0:
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
                    GLOBAL_EP += 1
                    print(
                        '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                        '|W%i' % self.wid,
                        '|Ep_r: %.2f' % ep_r,
                    )
                    break
GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
GLOBAL_RUNNING_R = []
COORD = tf.train.Coordinator()
QUEUE = queue.Queue()  # workers put collected data in this queue

threads = []
for worker in workers:  # worker threads
    t = threading.Thread(target=worker.work)
    t.start()           # start training
    threads.append(t)
# add the PPO updating thread
threads.append(threading.Thread(target=GLOBAL_PPO.update))
threads[-1].start()
COORD.join(threads)

# evaluate the trained policy
env = Maze(Map)
tf.reset_default_graph()
load_PPO = PPO(Load=True)
while True:
    s = env.reset()
    for t in range(100):
        env.render()
        a = load_PPO.choose_action(s)
        baseline_a = base.choose_action(env, 1)
        s, r, done = env.step({(0, a), (1, U.ch(baseline_a))})
        if r[0] != -1:
            print(r)
        if done:
            break