def train(self, result: Result) -> None:
    """Replay the buffered episode and fit the Q-network on discounted targets.

    Args:
        result: Terminal outcome of the episode; its ``value`` seeds the
            reward that is discounted backwards through ``self.buffer``.

    Side effects:
        Runs one training epoch on ``self.model``, clears ``self.buffer``,
        and decays ``self.epsilon`` toward ``self.epsilon_min``.
    """
    batch = []  # encoded board states fed to the network
    moves = []  # per-state move indices whose Q-targets are overwritten
    qs = []     # per-state target Q-values matching `moves`
    reward = result.value
    if result is Result.DISQUALIFIED:
        # Only the final (illegal) move is penalized — earlier moves were legal.
        last_board, last_move = self.buffer[-1]
        batch.append(encode_board(last_board, self.mark, as_batch=False))
        moves.append(last_move)
        qs.append(reward)
    else:
        # Walk the episode backwards so the terminal reward is discounted by
        # gamma once per step away from the end of the game.
        for board, move_hash in self.buffer[::-1]:
            invalid_moves = get_encoded_invalid_moves(board, self.mark)
            invalid_moves_qs = [Result.DISQUALIFIED.value] * len(invalid_moves)
            batch.append(encode_board(board, self.mark, as_batch=False))
            # Teach the chosen move its discounted reward, and pin every
            # invalid move to the disqualification penalty.
            moves.append([move_hash, *invalid_moves])
            qs.append([reward, *invalid_moves_qs])
            reward *= self.gamma
    batch = np.array(batch)
    # Start from the model's own predictions so untouched actions keep their
    # current Q-values; overwrite only the targeted move indices per row.
    targets = self.model.predict(batch)
    for i, (state_moves, state_qs) in enumerate(zip(moves, qs)):
        targets[i, state_moves] = state_qs
    self.model.fit(batch, targets, epochs=1, verbose=0)
    self.buffer = []
    if self.epsilon > self.epsilon_min:
        # Clamp at the floor: a bare `*=` could overshoot one step below
        # epsilon_min and then never update again.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
def move(self, board) -> Tuple[int, int, Action]:
    """Pick a move: epsilon-greedy while training, pure greedy otherwise.

    In training mode the chosen ``(board, move_hash)`` pair is appended to
    ``self.buffer`` for later replay in ``train``; in evaluation mode no
    state is recorded.

    Args:
        board: Current game board (project board type).

    Returns:
        The chosen move tuple from ``ALL_MOVES``.
    """
    mark = self.mark
    if not self.training:
        return self._greedy_move(board, mark)[0]
    if random() < self.epsilon:
        # Explore: uniformly random legal move; fall back to move 0 when
        # no legal move exists.
        move_hash = 0
        chosen_move = ALL_MOVES[move_hash]
        possible_moves = get_possible_moves(board, mark)
        if possible_moves:
            chosen_move = choice(possible_moves)
            move_hash = ALL_MOVES.index(chosen_move)
    else:
        chosen_move, move_hash = self._greedy_move(board, mark)
    self.buffer.append((board, move_hash))
    return chosen_move

def _greedy_move(self, board, mark):
    """Return ``(move, move_hash)`` of the highest-Q legal move.

    Falls back to move hash 0 when no prediction corresponds to a legal
    move. Extracted because the identical lookup appeared in both the
    training-exploit and evaluation branches of ``move``.
    """
    predictions = self.model.predict(encode_board(board, mark))[0]
    possible_moves_hashes = get_encoded_possible_moves(board, mark)
    move_hash = next(
        (x for _, x in sort_predictions(predictions)
         if x in possible_moves_hashes),
        0)
    return ALL_MOVES[move_hash], move_hash
def move(self, board) -> Tuple[int, int, Action]:
    """Select a move epsilon-greedily and, when training, record it.

    With probability ``self.epsilon`` a random legal move is played
    (falling back to move hash 0 if none exist); otherwise the network's
    highest-scoring legal move is taken. In training mode this also bumps
    the target-network counter and memorizes the transition.

    Args:
        board: Current game board (project board type).

    Returns:
        The chosen move tuple from ``ALL_MOVES``.
    """
    if random() < self.epsilon:
        # Exploration branch.
        legal = get_possible_moves(board, self.mark)
        if legal:
            chosen_move = choice(legal)
            move_hash = ALL_MOVES.index(chosen_move)
        else:
            move_hash = 0
            chosen_move = ALL_MOVES[move_hash]
    else:
        # Exploitation branch: best legal move by predicted Q-value.
        q_values = self.model.predict(encode_board(board, self.mark))[0]
        legal_hashes = get_encoded_possible_moves(board, self.mark)
        move_hash = next(
            (h for _, h in sort_predictions(q_values) if h in legal_hashes),
            0)
        chosen_move = ALL_MOVES[move_hash]
    if self.training:
        # Periodically sync the target network, then store the transition
        # (reward 0, non-terminal — rewards arrive at game end).
        self.tau += 1
        if self.tau > self.max_tau:
            self.tau = 0
            self.update_target_model()
        self.memorize(board, move_hash, 0, False)
    return chosen_move
def memorize(self, board, move_hash, reward, done):
    """Store the previous transition and remember the current one.

    Appends ``(prev_state, prev_move, reward, curr_state, invalid_mask,
    done)`` to ``self.memory`` — but only once a previous board exists —
    then shifts ``board``/``move_hash`` into ``self.prev_board`` /
    ``self.prev_move`` for the next call.

    Args:
        board: Current game board (project board type).
        move_hash: Index of the move just played, into ``ALL_MOVES``.
        reward: Reward attributed to the previous move.
        done: Whether the episode has terminated.
    """
    if self.prev_board:
        prev_state = encode_board(self.prev_board, self.mark, as_batch=False)
        if done:
            # Terminal transition: the successor state is never used, so
            # reuse the previous encoding and mask no moves as invalid.
            curr_state = prev_state
            invalid_moves_mask = [0] * MOVE_SPACE_SIZE
        else:
            curr_state = encode_board(board, self.mark, as_batch=False)
            # Mask every square occupied by the opponent as an invalid move.
            opponent = self.mark.opposite_mark()
            invalid_moves_mask = [
                board[r][c] is opponent for r, c, _ in ALL_MOVES
            ]
        self.memory.append(
            (prev_state, self.prev_move, reward,
             curr_state, invalid_moves_mask, done))
    self.prev_board = board
    self.prev_move = move_hash