def choose_action(self, state):
    """Return the board position with the highest network probability.

    The board is converted to player-1 perspective before querying the
    network (the brain is trained from player 1's point of view); moves
    are restricted to cells within distance 2 of existing stones.
    """
    board_p1 = state
    if self.player_num == 2:
        board_p1 = self.convert_to_p1_perspective(state)
    move_probs, _ = self.brain.predict(board_p1)
    allowed = alpha_gomoku_common.get_valid_actions(state, distance=2)
    # zero out the probability of every cell that is not a valid move
    mask = np.array([
        1 if alpha_gomoku_common.index_to_pos(idx) in allowed else 0
        for idx in range(BOARD_SIZE)
    ])
    move_probs *= mask
    best_idx = np.argmax(move_probs)
    return alpha_gomoku_common.index_to_pos(best_idx)
def choose_action_training(self, state_p1_np):
    """Pick a training-time action and its probability vector.

    An empty board always yields the centre square (7, 7) with a one-hot
    probability vector; otherwise the move probabilities come from
    ``self.player.get_probs`` and the argmax position is chosen.
    """
    # opening move: hard-code the centre of the (assumed 15x15) board
    if not np.count_nonzero(state_p1_np):
        centre_probs = np.zeros(225)
        centre_probs[112] = 1  # 112 == 7 * 15 + 7, the centre cell
        return (7, 7), np.array(centre_probs)
    move_probs = self.player.get_probs(state_p1_np)
    top_index = move_probs.index(max(move_probs))
    return alpha_gomoku_common.index_to_pos(top_index), move_probs
def choose_action(self, state):
    """Choose a move greedily from ``self.player``'s probabilities.

    Converts the board to player-1 perspective when playing as player 2,
    and takes the centre square (7, 7) on an empty board.
    """
    if self.player_num == 2:
        state = self.convert_to_p1_perspective(state)
    board = np.array(state)
    # opening move: take the centre of the empty board
    if not np.count_nonzero(board):
        return (7, 7)
    move_probs = self.player.get_probs(board)
    top_index = move_probs.index(max(move_probs))
    return alpha_gomoku_common.index_to_pos(top_index)
def predict(self, board):
    """Return (move probabilities, value) for ``board`` from player 1's view.

    Before querying the network, every valid action is tried directly:
    if placing a player-1 stone there wins immediately, a one-hot policy
    for that action is returned with a near-certain value of 0.99.
    Otherwise the neural network's prediction is returned.
    """
    for action in alpha_gomoku_common.get_valid_actions(board):
        # play the stone in place, test for a win, then undo the move
        board[action[0]][action[1]] = 1
        win = alpha_gomoku_common.detect_win(board, 1)
        board[action[0]][action[1]] = 0
        if win:
            # Wrap in np.array so this branch matches the numpy return
            # type of the network branch below — callers apply
            # element-wise ops (e.g. `probs *= mask`), which fail on a
            # plain Python list.
            return np.array([
                0 if alpha_gomoku_common.index_to_pos(i) != action else 1
                for i in range(BOARD_SIZE)
            ]), 0.99
    # nn_input: 4*15*15, numpy array
    nn_input = self.convert_to_nn_readable(board)
    p, v = self.model.predict(nn_input)
    return p[0], v[0][0]
def choose_action(self, state):
    """Choose a move from the network, optionally overridden by forced tactics.

    The network's masked argmax is the default move (``nn_action``).  When
    ``self.forced_actions`` is set, a sequence of hard-coded tactical checks
    runs first (in priority order: win now, block an immediate loss, create
    an open four, block an open four, block an open three, block/create
    double threats); the first check that fires returns its move instead.

    NOTE(review): pattern strings like 'ooooo' / '-oooo-' are interpreted by
    gomoku_pattern_detection.detect_pattern — presumably 'o' is a stone of
    the given player and '-' an empty cell; confirm against that module.
    """
    # work in player-1 perspective, since patterns below are checked for 1
    if self.player_num == 2:
        state_p1 = self.convert_to_p1_perspective(state)
    else:
        state_p1 = state
    probs, _ = self.brain.predict(state_p1)
    valid_actions = alpha_gomoku_common.get_valid_actions(state, distance=2)
    # mask out every board index that is not a valid action
    prob_mask = np.array(
        [1 if alpha_gomoku_common.index_to_pos(i) in valid_actions else 0
         for i in range(BOARD_SIZE)])
    probs *= prob_mask
    nn_action = alpha_gomoku_common.index_to_pos(np.argmax(probs))
    if self.forced_actions:
        # next_state = the position after the network's suggested move;
        # used below to test what threats the opponent would then have
        next_state = copy.deepcopy(state_p1)
        next_state[nn_action[0]][nn_action[1]] = 1
        next_state_valid_actions = alpha_gomoku_common.get_valid_actions(
            next_state, distance=1)
        # force win: play any move that completes five in a row.
        # Each candidate is placed, tested, and undone in place.
        for action in valid_actions:
            state_p1[action[0]][action[1]] = 1
            if gomoku_pattern_detection.detect_pattern(state_p1, 'ooooo', 1) >= 1:
                state_p1[action[0]][action[1]] = 0
                return action
            state_p1[action[0]][action[1]] = 0
        # force block lose: occupy any square where the opponent (2)
        # would win immediately
        for action in valid_actions:
            state_p1[action[0]][action[1]] = 2
            win = alpha_gomoku_common.detect_win(state_p1, 2)
            state_p1[action[0]][action[1]] = 0
            if win:
                return action
        # force create -oooo-: make an open four (wins next turn)
        for action in valid_actions:
            state_p1[action[0]][action[1]] = 1
            if gomoku_pattern_detection.detect_pattern(state_p1, '-oooo-', 1) >= 1:
                state_p1[action[0]][action[1]] = 0
                return action
            state_p1[action[0]][action[1]] = 0
        # force block -oooo-: if the opponent would have an open four after
        # our planned move, find a placement that removes it
        if gomoku_pattern_detection.detect_pattern(next_state, '-oooo-', 2) >= 1:
            for action in next_state_valid_actions:
                next_state[action[0]][action[1]] = 1
                if gomoku_pattern_detection.detect_pattern(next_state, '-oooo-', 2) == 0:
                    next_state[action[0]][action[1]] = 0
                    return action
                next_state[action[0]][action[1]] = 0
        # force block -ooo--: same idea for an opponent open three
        if gomoku_pattern_detection.detect_pattern(next_state, '-ooo--', 2) >= 1:
            for action in next_state_valid_actions:
                next_state[action[0]][action[1]] = 1
                if gomoku_pattern_detection.detect_pattern(next_state, '-ooo--', 2) == 0:
                    next_state[action[0]][action[1]] = 0
                    return action
                next_state[action[0]][action[1]] = 0
        # force block double threats: simulate an opponent stone on each
        # square; if it would give the opponent two or more simultaneous
        # threats, take that square ourselves.  The subtractions avoid
        # double-counting a pattern that is also matched by its wider form
        # (e.g. '--ooo--' also contains '-ooo--').
        for action in next_state_valid_actions:
            next_state[action[0]][action[1]] = 2
            if (gomoku_pattern_detection.detect_pattern(next_state, '-ooo--', 2) -
                    gomoku_pattern_detection.detect_pattern(next_state, '--ooo--', 2)) + \
                    (gomoku_pattern_detection.detect_pattern(next_state, 'oooo-', 2) -
                     gomoku_pattern_detection.detect_pattern(next_state, '-oooo-', 2)) + \
                    gomoku_pattern_detection.detect_pattern(next_state, 'oo-oo', 2) \
                    >= 2:
                next_state[action[0]][action[1]] = 0
                return action
            next_state[action[0]][action[1]] = 0
        # force create double threats: same scoring, but for our own stones
        for action in valid_actions:
            state_p1[action[0]][action[1]] = 1
            if (gomoku_pattern_detection.detect_pattern(state_p1, '-ooo--', 1) -
                    gomoku_pattern_detection.detect_pattern(state_p1, '--ooo--', 1)) + \
                    (gomoku_pattern_detection.detect_pattern(state_p1, 'oooo-', 1) -
                     gomoku_pattern_detection.detect_pattern(state_p1, '-oooo-', 1)) + \
                    gomoku_pattern_detection.detect_pattern(state_p1, 'oo-oo', 1) \
                    >= 2:
                state_p1[action[0]][action[1]] = 0
                return action
            state_p1[action[0]][action[1]] = 0
    # no forced tactic fired: fall back to the network's move
    return nn_action
def search(self, state_p1_np):
    """Run one MCTS simulation from ``state_p1_np`` (player-1 perspective).

    Returns the negated value of the state, i.e. the value from the
    point of view of the player who moved INTO this state — note every
    return below is negated for that reason.

    Caches (keyed by the board's raw bytes):
      s_r            -- terminal reward of a state
      s_p            -- masked, normalized move priors from the network
      s_n            -- visit count of a state
      s_valid_moves  -- valid 1-d action indices for a state
      sa_q / sa_n    -- running mean value / visit count per (state, action)
    """
    # s_bytes = np.array_str(state_p1_np)
    # raw bytes of the array serve as a hashable dictionary key
    s_bytes = state_p1_np.tobytes()
    s_list = state_p1_np.tolist()
    # check if reward is assigned to this state yet
    if s_bytes not in self.s_r:
        self.s_r[s_bytes] = alpha_gomoku_common.get_reward(s_list, 1)
    # if is terminal state (someone won, or board full), return
    if self.s_r[s_bytes] != 0 or np.count_nonzero(
            state_p1_np) == BOARD_SIZE:
        return -self.s_r[s_bytes]
    # check if is leaf node
    # if is leaf node, will have no move probabilities yet:
    # expand it with the network's priors and stop the simulation here
    if s_bytes not in self.s_p:
        probs, value = self.policy_value_obj.predict(s_list)
        valid_actions_1d = alpha_gomoku_common.get_valid_actions_1d(
            s_list, self.valid_actions_distance)
        self.s_valid_moves[s_bytes] = valid_actions_1d
        # mask = [1 if a in valid_actions_1d else 0 for a in range(BOARD_SIZE)]
        # probs = np.array([probs[i] * mask[i] for i in range(BOARD_SIZE)])
        # zero out priors of invalid moves
        probs = np.array([
            probs[a] if a in valid_actions_1d else 0
            for a in range(BOARD_SIZE)
        ])
        probs_sum = np.sum(probs)
        # if probs_sum is 0, set all probs to equal numbers
        # and print error
        if probs_sum == 0:
            probs = np.array([1 / probs.size for _ in range(probs.size)])
            print('warning: all moves masked')
        # re-normalize
        elif probs_sum != 1:
            probs /= probs_sum
        self.s_p[s_bytes] = probs
        self.s_n[s_bytes] = 0
        return -value
    # if code reaches here it means the state is not a leaf node
    # if not leaf node, select the next node by the PUCT rule:
    # score = Q(s,a) + c_puct * P(s,a) * sqrt(N(s)) / (1 + N(s,a))
    best_node_score = float('-inf')
    best_action = -1
    for a in self.s_valid_moves[s_bytes]:
        if (s_bytes, a) in self.sa_q:
            u = self.c_puct * self.s_p[s_bytes][a] * math.sqrt(
                self.s_n[s_bytes]) / (1 + self.sa_n[(s_bytes, a)])
            score = self.sa_q[(s_bytes, a)] + u  # q + u
        else:
            # unvisited edge: exploration term only; EPS keeps the
            # sqrt nonzero when the state itself has no visits yet
            u = self.c_puct * self.s_p[s_bytes][a] * math.sqrt(
                self.s_n[s_bytes] + EPS)
            score = u
        if score > best_node_score:
            best_node_score = score
            best_action = a
    # apply the chosen move and recurse: the perspective is flipped first,
    # so our stone is written as a 2 in the flipped board — the child state
    # is again from the player-to-move's (player-1) point of view
    # NOTE(review): assumes convert_perspective returns a new array/list
    # that is mutable by [row][col] indexing — confirm in its definition
    next_s = self.convert_perspective(state_p1_np)
    best_action_2d = alpha_gomoku_common.index_to_pos(best_action)
    next_s[best_action_2d[0]][best_action_2d[1]] = 2
    value = self.search(next_s)
    # back up the result: update the running mean Q and visit counts
    if (s_bytes,
            best_action) in self.sa_q:
        sa_n = self.sa_n[(s_bytes, best_action)]
        self.sa_q[(
            s_bytes, best_action)] = (self.sa_n[(s_bytes, best_action)] * self.sa_q[
                (s_bytes, best_action)] + value) / (sa_n + 1)
        self.sa_n[(s_bytes, best_action)] = sa_n + 1
    else:
        # first visit of this edge
        self.sa_q[(s_bytes, best_action)] = value
        self.sa_n[(s_bytes, best_action)] = 1
    self.s_n[s_bytes] += 1
    return -value