def _choose(self, state, available_actions):
    if self.is_train_mode and random.random() < self.exploit_rate:
        return random.choice(available_actions)
    ob = OptimalBoard(state)
    converted_actions = ob.convert_action_to_optimal(available_actions)
    action = self.q.rargmax(ob.board_id, converted_actions)
    return ob.convert_action_to_original(action)
def _feedback(self, state, action, next_state, reward, done):
    state_ob = OptimalBoard(state)
    converted_action = state_ob.convert_action_to_optimal(action)
    converted_state = self.convert_state(state_ob.optimal_board)
    next_ob = OptimalBoard(next_state)
    converted_next_state = self.convert_state(next_ob.optimal_board)
    self.network.add_train_set(converted_state, converted_action, reward,
                               converted_next_state, done)
    self.network.study()
def find_next(board, color, seq):
    actions = SP.available_actions(board)
    for action in actions:
        new_board = board[:]
        reward, done = SP.play(new_board, action, color)
        if done:
            # terminal node: print the move sequence and resulting board id
            if reward == 0:
                print(seq + str(action), '=', OB.board_to_id(new_board))
            else:
                print(seq + str(action), MARKER[color], OB.board_to_id(new_board))
        else:
            find_next(new_board, SP.next(color), seq + str(action))
def _choose(self, state, actions):
    if self.is_train_mode and random.random() < self.exploit_rate:
        next_pos = random.choice(actions)
        if self.debug:
            print("SELECT", actions, "RANDOM", next_pos)
        return next_pos

    found_p = -1.0
    found_c = []
    ob = OptimalBoard(state)
    _id = ob.board_id
    if self.debug:
        print("FROM", _id)
    scores = self.p_table.lookup(_id)
    converted_actions = ob.convert_action_to_optimal(actions)
    for action in converted_actions:
        p = scores[action]
        if self.debug:
            print("ACTION", ob.convert_action_to_original(action), p)
        if p > found_p:
            found_p = p
            found_c = [ob.convert_action_to_original(action)]
        elif p == found_p:
            found_c.append(ob.convert_action_to_original(action))

    next_pos = random.choice(found_c)
    if self.debug:
        print("SELECT", found_c, found_p, next_pos)
    return next_pos
def _calculate_reward(self, history, final_reward):
    '''
    Convert the turn history into training samples and assign each turn a
    discounted reward (final_reward multiplied by GAMMA per remaining step).
    '''
    replay_buffer = []
    size = len(history)
    for idx, turn in enumerate(history):
        optimal_board = OptimalBoard(turn[self.HISTORY_STATE])
        converted_action = optimal_board.convert_action_to_optimal(
            turn[self.HISTORY_ACTION])
        converted_state = self.convert_state(optimal_board.optimal_board)
        replay_buffer.append([
            converted_state, converted_action,
            final_reward * GAMMA**(size - idx - 1)
        ])

    # walk the episode backwards and overwrite index 2 (the reward of every turn)
    # with the running discounted value; this produces the same numbers as the
    # expression used above
    running_add = final_reward
    for i in reversed(range(len(replay_buffer))):
        replay_buffer[i][2] = running_add
        running_add = running_add * GAMMA
    return replay_buffer
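# Illustration only (not part of the agent): the backward pass in _calculate_reward
# gives the most recent turn the full final_reward and discounts older turns by
# GAMMA per step. Assuming GAMMA = 0.9 (the real constant is defined elsewhere in
# the repo) and final_reward = 1.0 for a 3-turn episode:
def _discount_example():
    gamma = 0.9
    final_reward = 1.0
    rewards = [None, None, None]      # one slot per turn, oldest first
    running_add = final_reward
    for i in reversed(range(len(rewards))):
        rewards[i] = running_add      # newest turn gets 1.0
        running_add *= gamma          # older turns get 0.9, then 0.81
    return rewards                    # approximately [0.81, 0.9, 1.0]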
def negamax(self, state, color, depth=10):
    '''
    implement negamax algorithm
    https://en.wikipedia.org/wiki/Negamax
    '''
    # negamax.counter += 1

    # CHECK LEAF NODE / DO NOT NEED TO CHECK DEPTH = 0 BECAUSE TicTacToe is too small
    # LEAF NODE is checked at play time

    # Transposition Table related work (state)
    # _id = ob.board_id
    _id = OptimalBoard.board_to_id(state)
    cache = self.tp.get(_id)
    if cache is not None:  # BUG FIX! cache can be 0, so check against None
        # case 1
        # return cache
        # case 2
        return cache[0], random.choice(cache[1])

    # RECURSIVE
    actions = SP.available_actions(state)
    random.shuffle(actions)  # with move ordering, alpha-beta pruning performs better
    best_score = -math.inf
    best_actions = []
    for action in actions:
        next_s = state[:]
        score, done = SP.play(next_s, action, color)
        if not done:
            score, _ = self.negamax(next_s, SP.next(color), depth - 1)
            score = -score  # negamax

        # pick from all best moves
        if score > best_score:
            best_score = score
            best_actions = [action]
        elif score == best_score:
            best_actions.append(action)

    # case 1: choose a random value once and cache that choice
    # choosed_result = random.choice(best_scores)
    # tp.put(_id, choosed_result)
    # return choosed_result

    # case 2: cache all best moves and choose a random one every time
    self.tp.put(_id, (best_score, best_actions))
    return (best_score, random.choice(best_actions))
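# Why the `is not None` check above matters (illustration only): a cached value of
# 0 (e.g. a drawn position's score under "case 1") is falsy, so `if cache:` would
# treat a valid cache hit as a miss and needlessly re-search the position.
def _cache_check_example():
    cache = 0                                 # a legitimate cached score for a draw
    hit_with_truthiness = bool(cache)         # False -- wrongly looks like a miss
    hit_with_none_check = cache is not None   # True  -- correctly a hit
    return hit_with_truthiness, hit_with_none_check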
def _choose(self, state, available_actions):
    optimal_board = OptimalBoard(state)
    converted_actions = optimal_board.convert_action_to_optimal(
        available_actions)
    converted_state = self.convert_state(optimal_board.optimal_board)

    if self.is_train_mode and random.random() < self.egreedy:
        action = random.choice(converted_actions)
    else:
        action = self.network.predict_one(converted_state)

    if action not in converted_actions:
        # still undecided what to feed back as a training sample here,
        # or whether to filter invalid moves inside predict_one instead
        self.network.add_train_set(converted_state, action, -1,
                                   self.convert_state([-1] * 9), True)
        action = random.choice(converted_actions)

    original_action = optimal_board.convert_action_to_original(action)
    return original_action
def _episode_feedback(self, reward):
    # for winner
    history_left = reversed(self.all_history())

    (state, action, _, _, _) = next(history_left)  # pop last history entry and set it
    ob = OptimalBoard(state)
    reward = self.p_table.set(ob.board_id,
                              ob.convert_action_to_optimal(action), reward)

    for (state, action, _, _, _) in history_left:
        ob = OptimalBoard(state)
        reward = self.p_table.learn(ob.board_id,
                                    ob.convert_action_to_optimal(action), reward)
def negamax_alpha_beta_pruning(self,
                               state,
                               color,
                               alpha=-math.inf,
                               beta=math.inf,
                               depth=10):
    '''
    implement negamax algorithm with alpha-beta pruning
    https://en.wikipedia.org/wiki/Negamax
    '''
    # negamax.counter += 1

    # CHECK LEAF NODE / DO NOT NEED TO CHECK DEPTH = 0 BECAUSE TicTacToe is too small
    # LEAF NODE is checked at play time
    orig_alpha = alpha

    # Transposition Table related work
    # ob = OptimalBoard(state)
    # _id = ob.board_id
    _id = OptimalBoard.board_to_id(state)
    cache = self.tp.get(_id)
    if cache and cache['depth'] >= depth:
        (cached_score, cached_action) = cache['value']
        if cache['flag'] == self.tp.EXACT:
            return (cached_score, cached_action)
        elif cache['flag'] == self.tp.LOWERBOUND:
            alpha = max(alpha, cached_score)
        elif cache['flag'] == self.tp.UPPERBOUND:
            beta = min(beta, cached_score)
        if alpha >= beta:
            return cached_score, cached_action
    # else:
    #     print("MISS", t.seq)

    # RECURSIVE
    actions = SP.available_actions(state)
    random.shuffle(actions)  # with move ordering, alpha-beta pruning performs better
    best_score = -math.inf
    best_move = -1
    for action in actions:
        next_s = state[:]
        score, done = SP.play(next_s, action, color)
        if not done:
            score, _ = self.negamax_alpha_beta_pruning(next_s,
                                                       SP.next(color),
                                                       alpha=-beta,
                                                       beta=-alpha,
                                                       depth=depth - 1)
            score = -score  # negamax

        # just pick the first best move (random.shuffle provides the randomness)
        if best_score < score or (score == best_score and random.random() < 0.5):
            best_score = score
            best_move = action
        if alpha < score:
            alpha = score  # effectively alpha = max(alpha, best_score)
        if alpha >= beta:
            break

    if best_score <= orig_alpha:
        flag = self.tp.UPPERBOUND
    elif best_score >= beta:
        flag = self.tp.LOWERBOUND
    else:
        flag = self.tp.EXACT
    self.tp.put(key=_id, depth=depth, value=(best_score, best_move), flag=flag)

    return (alpha, best_move)
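# Minimal sketch (hypothetical, for illustration) of the transposition-table
# interface that negamax_alpha_beta_pruning assumes: get()/put() plus the three
# bound flags. The repo's real implementation may differ.
class SimpleTranspositionTable:
    EXACT = 0
    LOWERBOUND = 1
    UPPERBOUND = 2

    def __init__(self):
        self._table = {}

    def get(self, key):
        # returns None on a miss, otherwise {'depth': ..., 'value': ..., 'flag': ...}
        return self._table.get(key)

    def put(self, key, depth, value, flag):
        self._table[key] = {'depth': depth, 'value': value, 'flag': flag}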
def _feedback(self, state, action, next_state, reward, done):
    ob1 = OptimalBoard(state)
    ob2 = OptimalBoard(next_state)
    self.q.learn(ob1.board_id, ob1.convert_action_to_optimal(action), reward,
                 ob2.board_id)
def to_board_id(board):
    '''
    Return the board id used as the node key.
    '''
    return OptimalBoard(board).board_id
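# Hypothetical illustration of what a board id could look like: a base-3 encoding
# of the nine cells (0 = empty, 1 = X, 2 = O). This is an assumption for
# illustration only; OptimalBoard presumably also canonicalizes the board (e.g.
# via rotations/reflections) before computing its id, which this naive version
# does not do.
def naive_board_to_id(board):
    board_id = 0
    for cell in board:            # board is assumed to be a flat list of 9 cells
        board_id = board_id * 3 + cell
    return board_id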