def MCTS_self_play(connectnet, num_games, start_idx, cpu, args, iteration):
    """Generate self-play games with network-guided MCTS and save training data.

    Plays ``num_games`` games on this worker, and for each game pickles a list of
    [encoded board state, MCTS policy, game outcome] triples under
    ``datasets/iter_<iteration>/``.

    :param connectnet: neural network used to evaluate positions inside MCTS
    :param num_games: number of games this worker should play
    :param start_idx: first game index (keeps filenames unique across workers)
    :param cpu: worker id, used in log messages and output filenames
    :param args: run configuration; reads ``args.temperature_MCTS``
    :param iteration: training-iteration number, selects the dataset subfolder
    :return: None (results are written to disk via save_as_pickle)
    """
    logger.info("[CPU: %d]: Starting MCTS self-play..." % cpu)
    # makedirs with exist_ok=True avoids the check-then-create race that the
    # original isdir/mkdir sequence had when several CPU workers start at once.
    os.makedirs("./datasets/iter_%d" % iteration, exist_ok=True)
    for idxx in tqdm(range(start_idx, num_games + start_idx)):
        logger.info("[CPU: %d]: Game %d" % (cpu, idxx))
        current_board = c_board()
        checkmate = False
        dataset = []  # (encoded state, policy) pairs; outcome attached after the game
        value = 0
        move_count = 0
        # Play until somebody wins or there are no legal moves left (draw).
        while checkmate == False and current_board.actions() != []:
            t = args.temperature_MCTS  # move-sampling temperature (higher = more exploration)
            board_state = copy.deepcopy(ed.encode_board(current_board))
            root = UCT_search(current_board, 777, connectnet, t)
            policy = get_policy(root, t)
            print("[CPU: %d]: Game %d POLICY:\n " % (cpu, idxx), policy)
            # Sample a column from the MCTS visit-count policy and apply it.
            current_board = do_decode_n_move_pieces(
                current_board,
                np.random.choice(np.array([0, 1, 2, 3, 4, 5, 6, 7, 8]), p=policy),
            )  # decode move and move piece(s)
            dataset.append([board_state, policy])
            print("[Iteration: %d CPU: %d]: Game %d CURRENT BOARD:\n" % (iteration, cpu, idxx),
                  current_board.current_board, current_board.player)
            print(" ")
            if current_board.check_winner() == True:  # if somebody won
                if current_board.player == 0:  # black wins
                    value = -1
                elif current_board.player == 1:  # white wins
                    value = 1
                checkmate = True
            move_count += 1
        # Attach the final game outcome to every recorded position; the very
        # first (root) position keeps a value of 0.
        dataset_p = []
        for idx, data in enumerate(dataset):
            s, p = data
            if idx == 0:
                dataset_p.append([s, p, 0])
            else:
                dataset_p.append([s, p, value])
        del dataset  # free the intermediate list before pickling
        save_as_pickle("iter_%d/" % iteration +
                       "dataset_iter%d_cpu%i_%i_%s" % (iteration, cpu, idxx,
                                                       datetime.datetime.today().strftime("%Y-%m-%d")),
                       dataset_p)
def MCTS_self_play(connectnet, num_games, cpu):
    """Generate Connect-Four self-play games with network-guided MCTS.

    Plays ``num_games`` games and pickles, per game, a list of
    [encoded board state, MCTS policy, game outcome] triples for later
    neural-network training.

    :param connectnet: neural network used to evaluate positions inside MCTS
    :param num_games: number of games to play on this worker
    :param cpu: worker id, used in the output filename
    :return: None (results are written to disk via save_as_pickle)
    """
    # We want to iterate over num_games games.
    for idxx in range(0, num_games):
        # Set initial variables.
        current_board = c_board()
        checkmate = False
        dataset = []  # (encoded state, policy) pairs; outcome attached after the game
        value = 0
        move_count = 0
        # Keep playing as long as the game is unfinished.
        while checkmate == False and current_board.actions() != []:
            # If we are in the first 11 moves, we set the temperature to 1
            # (meaning more exploration), otherwise we set it to 0.1.
            if move_count < 11:
                t = 1
            else:
                t = 0.1
            # Explore the current state up to 777 times. The exploration policy is
            # determined by the output of the network. See comment at the top of
            # UCT_search function for more info.
            board_state = copy.deepcopy(ed.encode_board(current_board))
            root = UCT_search(current_board, 777, connectnet, t)
            policy = get_policy(root, t)
            print(policy)
            # Sample one of the 7 columns from the MCTS policy and apply it.
            current_board = do_decode_n_move_pieces(
                current_board,
                np.random.choice(np.array([0, 1, 2, 3, 4, 5, 6]), p=policy),
            )  # decode move and move piece(s)
            # Record the dataset for future deep learning training, then check
            # whether the game is over.
            dataset.append([board_state, policy])
            print(current_board.current_board, current_board.player)
            print(" ")
            if current_board.check_winner() == True:  # if somebody won
                if current_board.player == 0:  # black wins
                    value = -1
                elif current_board.player == 1:  # white wins
                    value = 1
                checkmate = True
            move_count += 1
        # End while loop: attach the final outcome to every recorded position;
        # the very first (root) position keeps a value of 0.
        dataset_p = []
        for idx, data in enumerate(dataset):
            state, policy = data
            if idx == 0:
                dataset_p.append([state, policy, 0])
            else:
                dataset_p.append([state, policy, value])
        del dataset  # free the intermediate list before pickling
        save_as_pickle(
            "dataset_cpu%i_%i_%s" % (cpu, idxx,
                                     datetime.datetime.today().strftime("%Y-%m-%d")),
            dataset_p)
def MCTS_self_play(connectnet, num_games, start_idx, cpu, args, iteration):
    """
    Play with itself using the NN with MCTS.

    Plays ``num_games`` games and, per game, pickles a list of
    [encoded board state, MCTS policy, game outcome] triples under
    ``datasets/iter_<iteration>/`` for later network training.

    :param connectnet: pytorch model used to evaluate positions inside MCTS
    :param num_games: number of games this worker should play
    :param start_idx: first game index (keeps filenames unique across workers)
    :param cpu: worker id, used in log messages and output filenames
    :param args: run configuration; reads ``args.temperature_MCTS``
    :param iteration: training-iteration number, selects the dataset subfolder
    :return: None (results are written to disk via save_as_pickle)
    """
    logger.info("[CPU: %d]: Starting MCTS self-play..." % cpu)
    # makedirs with exist_ok=True avoids the check-then-create race that the
    # original isdir/mkdir sequence had when several CPU workers start at once.
    os.makedirs("./datasets/iter_%d" % iteration, exist_ok=True)
    for idxx in tqdm(range(start_idx, num_games + start_idx)):
        logger.info("[CPU: %d]: Game %d" % (cpu, idxx))
        current_board = c_board()
        checkmate = False
        dataset = []  # (encoded state, policy) pairs; outcome attached after the game
        value = 0
        move_count = 0
        # When the game already produced an outcome, or there is no more room
        # for play, stop.
        while checkmate == False and current_board.actions() != []:
            # High temperature (exploration) for the opening moves, then low
            # temperature (exploitation) for the rest of the game.
            if move_count < 11:
                t = args.temperature_MCTS
            else:
                t = 0.1
            board_state = copy.deepcopy(ed.encode_board(current_board))
            root = UCT_search(current_board, 777, connectnet, t)
            policy = get_policy(root, t)
            print("[CPU: %d]: Game %d POLICY:\n " % (cpu, idxx), policy)
            # Sample one of the 7 columns from the MCTS policy and apply it.
            current_board = do_decode_n_move_pieces(
                current_board,
                np.random.choice(np.array([0, 1, 2, 3, 4, 5, 6]), p=policy),
            )  # decode move and move piece(s)
            dataset.append([board_state, policy])
            print("[Iteration: %d CPU: %d]: Game %d CURRENT BOARD:\n" %
                  (iteration, cpu, idxx),
                  current_board.current_board, current_board.player)
            print(" ")
            if current_board.check_winner() == True:  # if somebody won
                if current_board.player == 0:  # black wins
                    value = -1
                elif current_board.player == 1:  # white wins
                    value = 1
                checkmate = True
            move_count += 1
        # Attach the final outcome to every recorded position. Only the
        # positions actually visited in the game are stored, not all the nodes
        # explored during the MC simulations.
        dataset_p = []
        for idx, data in enumerate(dataset):
            s, p = data  # s, p are state, policy
            if idx == 0:
                # The root (empty-board) position has value 0.
                dataset_p.append([s, p, 0])
            else:
                # All other positions share the game outcome (-1 or 1).
                dataset_p.append([s, p, value])
        del dataset  # free the intermediate list before pickling
        # The pickled dataset_p is consumed later when training the network.
        save_as_pickle("iter_%d/" % iteration +
                       "dataset_iter%d_cpu%i_%i_%s" %
                       (iteration, cpu, idxx,
                        datetime.datetime.today().strftime("%Y-%m-%d")),
                       dataset_p)