def play_game(state_list, action_list, Qtable):
    step = 0  # total loop count
    old_state = new_state = []  #- state
    flag = 0
    reset()
    pt.init(0.5)
    # main loop
    while pt.alive():
        #########################
        # per-loop analysis and input handling goes here
        if step % 180 == 0 and flag == 0:
            new_board = pt.getBoard()
            # binarize the board: any piece cell (1..7) becomes 1
            for i in range(len(new_board)):
                for j in range(len(new_board[i])):
                    if new_board[i][j] > 0 and new_board[i][j] < 8:
                        new_board[i][j] = 1
            piece = pt.getPiece()
            new_state = [new_board, piece]
            if new_state != old_state and flag == 0:
                action, value = egreedy.search_action(new_state)
                if action == []:  # unseen state: fall back to epsilon-greedy
                    action = egreedy.eGreedy(new_state)
                old_state = new_state
                flag = 1
        if step % 10 == 0 and flag == 1:
            flag = doaction(action, piece)
            if step % 100 == 0:
                flag = 2
        # print(new_state)
        #########################
        # advance to the next frame
        pt.loop()
        if flag == 2:
            print pt.param.flag['update']
            pt.move('down')
            if pt.param.flag['update']:
                print "update"
                pt.loop()
                flag = 0
        if pt.param.flag['gameover']:
            flag = 0  # pt.alive() ends the loop
        step += 1
    # shut down
    pt.quit()
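# --- Hedged sketch, not part of the original source: one plausible doaction(). ---
# Both play_game() and qLearning() call doaction(action, piece) until it
# returns 2 ("action finished"). Per the comments in qLearning(), action is
# [horizontal moves, rotation count]. This one-shot variant issues the whole
# action at once and returns 2 immediately; it assumes pt.move() also accepts
# 'left'/'right'/'up' (with 'up' rotating the piece), which is an assumption
# about the pt wrapper. A step-by-step variant would instead keep its
# progress between calls and return 1 until done.
def doaction(action, piece):  # piece is accepted only to match the call sites
    moves, rotations = action
    for _ in range(rotations):
        pt.move('up')  # assumed rotate command
    for _ in range(abs(moves)):
        pt.move('right' if moves > 0 else 'left')
    return 2  # everything issued: the action is complete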
def qLearning(maxsteps=100, rate_a=0.6, rate_g=0.3):
    #- state = [board_state, tetrimino]: board state and the falling tetromino
    #- steps, maxsteps: episode counter and number of episodes
    #- state_list: list of states (board state plus the falling piece)
    #- action_list: list of actions, each [horizontal moves, rotation count]
    #- Qtable: two-dimensional array holding the Q-values
    steps = 0
    flag = 0
    while steps < maxsteps:
        # float division: Python 2 integer division would always yield 0 here
        rate = change_rate(float(steps) / maxsteps, rate_a, rate_g)
        alpha, gamma = rate_a * rate, rate_g * rate
        #state_list = []
        pt.init(0.5)
        alive = 1
        old_board = board = []
        print "step:", steps, "start"
        while alive:  #- loop until game over
            if flag == 0:
                board = pt.getBoard()
                if old_board != board:
                    # binarize the board: any piece cell becomes 1
                    for i in range(len(board)):
                        for j in range(len(board[i])):
                            if board[i][j] != 0 and board[i][j] != 8:
                                board[i][j] = 1
                    piece = pt.getPiece()
                    state = [board, piece]
                    #- exploration / exploitation
                    action = egreedy.eGreedy(state)  # softmax would also work; epsilon-greedy for now
                    #- update state_list, action_list and Qtable
                    if action not in lists.action_list:
                        lists.action_list.append(action)
                    add_state(state)
                    updateQ.updateQ(state, action, alpha, gamma)
                    #- execute the action
                    old_board = board
                    flag = 1  # prepare the next state
            if flag == 1:
                step = 0
                while flag != 2:
                    flag = doaction(action, piece)
                    step += 1
                    if step % 100 == 0:  # safety stop after 100 sub-steps
                        flag = 2
            elif flag == 2:
                pt.loop()
                #pt.move('down')
                pt.drop()
                if pt.param.flag['update']:
                    pt.loop()
                    flag = 0
            if pt.param.flag['gameover']:
                alive = flag = 0
        steps += 1
        reset()
    # drop empty rows left in the Q-table
    check = 1
    # if len(lists.Qtable) != len(lists.state_list): print "check"
    while check:
        if [] in lists.Qtable:
            lists.Qtable.remove([])
        else:
            check = 0
    # if len(lists.Qtable) != len(lists.state_list): print len(lists.state_list), len(lists.Qtable)
    # flag = steps / maxsteps
    return lists.state_list, lists.action_list, lists.Qtable
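# --- Hedged sketch, not part of the original source: one plausible change_rate(). ---
# qLearning() calls change_rate(progress, rate_a, rate_g) with progress in
# [0, 1) and scales alpha and gamma by the result, so a decaying multiplier
# that starts at 1.0 fits the call site. The linear schedule and the 0.1
# floor are assumptions; rate_a and rate_g are accepted only to match the
# signature used above.
def change_rate(progress, rate_a, rate_g):
    return 1.0 - 0.9 * progress  # decays linearly from 1.0 toward 0.1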
import numpy as np

import bern_bandit
import egreedy
import thompson
import ucb

trials = 50
plays = 1000
samples = 20

rewards1 = np.zeros((trials, plays))
rewards2 = np.zeros((trials, plays))
rewards3 = np.zeros((trials, plays))
optimal_rewards = np.zeros((trials, plays))

#env = bern_bandit.BernBandits()
for t in range(trials):
    #env = bandits.Bandits()
    env = bern_bandit.BernBandits()
    strat1 = egreedy.eGreedy(env.n)
    strat2 = ucb.UCB(env.n)
    strat3 = thompson.Thompson(env.n)
    for i in range(plays):
        arm1 = strat1.action()
        arm2 = strat2.action()
        arm3 = strat3.action()
        r1 = env.pull(arm1)
        r2 = env.pull(arm2)
        r3 = env.pull(arm3)
        strat1.update(arm1, r1)
        strat2.update(arm2, r2)
        strat3.update(arm3, r3)
        # record the rewards so the preallocated arrays are actually filled
        rewards1[t, i] = r1
        rewards2[t, i] = r2
        rewards3[t, i] = r3
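# --- Hedged sketch, not part of the original source: a minimal BernBandits. ---
# The experiment above only relies on an `n` attribute (number of arms) and
# pull(arm) returning a 0/1 reward. The default arm count and the uniformly
# drawn success probabilities are assumptions.
import random

class BernBandits(object):
    def __init__(self, n=10):
        self.n = n  # number of arms
        self.probs = [random.random() for _ in range(n)]  # success probability per arm

    def pull(self, arm):
        # Bernoulli reward: 1 with the arm's success probability, else 0
        return 1 if random.random() < self.probs[arm] else 0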