def move(self, position, board, is_my_move=True):
    '''Return the board after playing `position` on `board`.

    Set is_my_move=False to simulate the opponent's move instead of your own.
    Returns (board, game_over, free_turn), with the board always expressed
    from the caller's perspective.
    '''
    # play on a deep copy so the caller's board is never mutated
    board = copy.deepcopy(board)
    prediction = Kalah(board)
    if not is_my_move:
        # flip the board so the opponent's pits occupy the "my side" slots
        prediction.board = reverse_board(board)
    _, free_turn = prediction.move(position)
    board = prediction.board
    if not is_my_move:
        # flip back to the caller's perspective
        board = reverse_board(board)
    return board, prediction.is_game_over(), free_turn
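# Illustrative usage (an assumption, not code from this project): given `agent`, an
# instance of the class that defines move() above, and `board`, a 14-element Kalah
# board, either side's move can be previewed without touching the live game:
#
#     board_after, game_over, free_turn = agent.move(3, board)                    # my pit 3
#     board_after, game_over, free_turn = agent.move(3, board, is_my_move=False)  # opponent's pit 3
#
# Because move() works on a deep copy inside a throwaway Kalah instance, `board`
# itself is left unchanged.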
def run_game(self, tree_visualization=True):
    for i in range(self.num_of_games):
        # build the two players for this game
        if self.am_i_minmax:
            self.user = Minimax()
        else:
            module, name = self.user_path.rsplit('.', 1)
            self.user = getattr(importlib.import_module(module), name)(
                number_of_simulation=500, simulation_depth=6)
        if self.is_user_defined_opponent:
            module, name = self.opponent_path.rsplit('.', 1)
            self.opponent = getattr(importlib.import_module(module), name)(
                number_of_simulation=1000)
        else:
            self.opponent = Minimax()

        print("New game!")
        print("Initial board >>")
        # initialization:
        initial_board = [4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 0]
        new_game = Kalah(initial_board)
        if i % 2 == 1:
            # alternate who moves first between games
            new_game.player = False
        new_game.show_board()
        if not self.am_i_minmax:
            self.user.initial_root(initial_board)
        if self.is_user_defined_opponent:
            self.opponent.initial_root(initial_board)

        turn = 0
        while not new_game.is_game_over():
            turn += 1
            # pick a hole:
            if new_game.player:
                start_time = time.time()
                next_position = self.user.search(copy.deepcopy(new_game.get_board()))
                end_time = time.time()
                print('measured time: ', end_time - start_time)
                self.is_time_out(start_time, end_time)
            else:
                next_position = self.opponent.search(copy.deepcopy(new_game.get_board()))

            # update:
            tmp_score, free_turn = new_game.move(next_position)

            # print and advance the search trees:
            if not self.am_i_minmax:
                print("winning rate:", self.user.print_winning_rate(next_position))
                if tree_visualization:
                    show_image(self.user.g.render(view=False), auto_close=False)
                self.user.update_root(next_position,
                                      copy.deepcopy(new_game.get_board()),
                                      copy.deepcopy(new_game.player))
            if self.is_user_defined_opponent:
                self.opponent.update_root(next_position,
                                          copy.deepcopy(new_game.get_board()),
                                          copy.deepcopy(not new_game.player))

        # end of a game, print result:
        new_game.show_board()
        turn = 0
        self.score_board(i, new_game.result())
        del self.user
        del self.opponent
def main():
    # `runner`, the policy/target networks, optimizers, replay memory, `device`,
    # and TARGET_UPDATE are module-level objects defined elsewhere.
    num_episodes = 10000
    # player = User(simulation_depth=6, number_of_simulation=1000)
    if runner.am_i_minmax:
        runner.user = Minimax()
    else:
        runner.user = User(simulation_depth=6, number_of_simulation=200)
    if runner.is_user_defined_opponent:
        module, name = runner.opponent_path.rsplit('.', 1)
        runner.opponent = getattr(importlib.import_module(module), name)(
            number_of_simulation=1000)
    else:
        runner.opponent = Minimax()

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        print("New games for training!")
        initial_board = [4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 0]
        new_game = Kalah(initial_board)
        if i_episode % 2 == 0:
            new_game.player = False
        if not runner.am_i_minmax:
            runner.user.initial_root(initial_board)
        if runner.is_user_defined_opponent:
            runner.opponent.initial_root(initial_board)

        num = 0
        loss_sum = 0
        for turn in count():
            # select and perform an action
            current_board = copy.deepcopy(new_game.get_board())
            state = game_state_to_tensor(current_board)
            if new_game.player:
                # my turn: left-side model
                cur_policy = left_policy_net
                cur_target = left_target_net
                opt = optimizer_left
                while True:
                    # re-sample until the chosen pit is legal (non-empty)
                    action = select_action(current_board, cur_target)
                    next_position = action.item()
                    if new_game.get_board()[next_position] != 0 or new_game.is_game_over():
                        break
            else:
                # opponent's turn: the right-side model is disabled, so the
                # left-side model plays this side as well (self-play)
                cur_policy = left_policy_net
                cur_target = left_target_net
                opt = optimizer_left
                while True:
                    # action = torch.tensor([[runner.opponent.search(current_board)]], device=device)
                    action = select_action(current_board, cur_target)
                    next_position = action.item()
                    if new_game.get_board()[next_position] != 0 or new_game.is_game_over():
                        break

            _, free_turn = new_game.move(next_position)

            # observe the new state
            next_board = copy.deepcopy(new_game.get_board())
            next_state = game_state_to_tensor(next_board)
            reward = evaluation(new_game.is_game_over(), next_board)

            # store the transition in replay memory
            reward = torch.tensor([reward], device=device, dtype=torch.float)
            memory.push(state, action, next_state, reward)

            # compute the loss
            loss = optimize_model(free_turn, cur_policy, cur_target, opt)
            loss_sum += loss

            if not runner.am_i_minmax:
                runner.user.update_root(next_position,
                                        copy.deepcopy(new_game.get_board()),
                                        copy.deepcopy(new_game.player))
            if runner.is_user_defined_opponent:
                runner.opponent.update_root(next_position,
                                            copy.deepcopy(new_game.get_board()),
                                            copy.deepcopy(not new_game.player))
            if new_game.is_game_over():
                num = turn
                break

        runner.score_board(i_episode, new_game.result())
        print(i_episode, 'game Average loss: ', loss_sum / num)
        print()

        # update the target network
        if i_episode % TARGET_UPDATE == 1:
            left_target_net.load_state_dict(left_policy_net.state_dict())
            # right_target_net.load_state_dict(right_policy_net.state_dict())
        if i_episode % 50 == 49:
            # save a checkpoint and reset the scoreboard every 50 episodes
            torch.save(left_target_net.state_dict(), 'checkpoint_left.pth')
            runner.wins = 0
            runner.losses = 0
            runner.draws = 0
            # torch.save(right_target_net.state_dict(), 'checkpoint_right.pth')

    torch.save(left_target_net.state_dict(), 'dqn_cnn_left.pth')
    # torch.save(right_target_net.state_dict(), 'dqn_cnn_right.pth')
    print('Complete')
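# The helpers used above (select_action, game_state_to_tensor, evaluation,
# optimize_model, memory, device, TARGET_UPDATE) are defined elsewhere in this
# module. Purely as an illustration of the shape the training loop assumes, and
# not the project's actual implementation, select_action could be an
# epsilon-greedy choice over the six pits:
#
#     def select_action(board, net):
#         if random.random() < 0.05:                          # explore
#             return torch.tensor([[random.randrange(6)]],
#                                 device=device, dtype=torch.long)
#         with torch.no_grad():                               # exploit
#             q_values = net(game_state_to_tensor(board))     # shape [1, 6]
#             return q_values.max(1)[1].view(1, 1)            # greedy pit index
#
# The while-loops in main() then re-sample until the returned pit is non-empty.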