def __init__(self):
    # game (Gomoku) settings
    self.board_width, self.board_height = 9, 9
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.batch_size = 512  # mini-batch size: sample 512 examples from the buffer
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 500  # check and save the model every this many batches (originally 100)
    self.game_batch_num = 3000  # maximum number of training batches
    self.train_num = 0  # current training batch count
    # start training the policy-value net
    self.policy_value_net = PolicyValueNet(self.board_width, self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 6
    self.board_height = 6
    self.n_in_row = 4
    self.board = Quoridor()
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 1000  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 1000  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 64
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 2000
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(model_file=init_model, use_gpu=True)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet()
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    self.board = Board()
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    # self.board_height,
    self.temp = 1.0  # the temperature param
    self.n_playout = 1600  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 15000
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 1000
    if init_model:
        self.policy_value_net = PolicyValueNet(model_file=init_model, use_gpu=True)
    else:
        self.policy_value_net = PolicyValueNet(use_gpu=True)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
    print("init done")
def __init__(self, init_model=None):
    # board params
    self.game = Quoridor()
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate
    self.temp = 1.0
    self.n_playout = 400
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 128  # was set to 1 while testing
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 1000
    if init_model:
        self.policy_value_net = PolicyValueNet(model_file=init_model)
    else:
        self.policy_value_net = PolicyValueNet()
    # set up the computer (MCTS) player
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self):
    # params of the board and the game
    self.board_width = 5
    self.board_height = 5
    self.game = Game()
    # training params
    self.learn_rate = 0.001
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 500  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 128  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 100
    self.game_batch_num = 2000
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 3000
    # start training from a new policy-value net
    self.policy_value_net = PolicyValueNet(self.board_width, self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the game
    self.width = 4
    self.height = 4
    self.game = Game()
    # params of training
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0
    self.temp = 1.0
    self.n_playout = 300
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 64
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 5000
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 500
    if init_model:
        self.policy_value_net = PolicyValueNet(self.width, self.height,
                                               model_file=init_model)
    else:
        self.policy_value_net = PolicyValueNet(self.width, self.height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def run(self):
    """run the training pipeline"""
    try:
        for i in range(self.game_batch_num):
            self.collect_selfplay_data(self.play_batch_size)
            print("batch i:{}, episode_len:{}".format(
                i+1, self.episode_len))
            if len(self.data_buffer) > self.batch_size:
                loss, entropy = self.policy_update()
            # check the performance of the current model,
            # and save the model params
            if (i+1) % self.check_freq == 0:
                print("current self-play batch: {}".format(i+1))
                # win_ratio = self.policy_evaluate(n_games=1)
                self.policy_value_net.save_model('./best_policy.model')
                self.policy_value_net = PolicyValueNet('./best_policy.model',
                                                       use_gpu=True)
                # if win_ratio > self.best_win_ratio:
                #     print("New best policy!!!!!!!!")
                #     self.best_win_ratio = win_ratio
                #     # update the best_policy
                #     self.policy_value_net.save_model('./best_policy.model')
                #     if (self.best_win_ratio == 1.0 and
                #             self.pure_mcts_playout_num < 5000):
                #         self.pure_mcts_playout_num += 1000
                #         self.best_win_ratio = 0.0
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self):
    # params of the board and the game
    self.board_width = BOARD_SIZE
    self.board_height = BOARD_SIZE
    self.board = Board()
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 300  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.025
    self.check_freq = 1
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    self.episode_len = 0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 300
    # start training from a given policy-value net
    # policy_param = pickle.load(open('current_policy.model', 'rb'))
    # self.policy_value_net = PolicyValueNet(self.board_width, self.board_height,
    #                                        net_params=policy_param)
    # start training from a new policy-value net
    self.policy_value_net = PolicyValueNet(self.board_width, self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    self.game = Quoridor()
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0
    self.temp = 1.0
    self.n_playout = 200
    self.c_puct = 5
    self.buffer_size = 10000
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.kl_targ = 0.02
    self.check_freq = 10
    self.game_batch_num = 1000
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 1000
    self.old_probs = 0
    self.new_probs = 0
    self.first_trained = False
    if init_model:
        self.policy_value_net = PolicyValueNet(model_file=init_model)
    else:
        self.policy_value_net = PolicyValueNet()
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.n = 8
    self.board = Board(self.n)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_play_out = 400  # number of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.epochs = 5  # number of train_steps for each update
    self.kl_target = 0.025
    self.check_freq = 50
    self.game_batch_number = 10000
    self.best_win_ratio = 0.0
    self.episode_length = 0
    self.pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    # number of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.last_batch_number = 0
    self.pure_mcts_play_out_number = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.n, model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.n)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_func,
                                  c_puct=self.c_puct,
                                  n_play_out=self.n_play_out,
                                  is_self_play=1)
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.game = Game()
        # training params
        self.config = TrainConfig()
        self.greedy_config = TrainGreedyConfig()
        self.data_buffer = deque(maxlen=self.config.buffer_size)
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet()
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.config.c_puct,
                                      n_playout=self.config.n_playout,
                                      is_selfplay=1)
        self.mcts_player_greedy = MCTSPlayerGreedy(
            self.policy_value_net.policy_value_fn,
            c_puct=self.greedy_config.c_puct,
            n_playout=self.greedy_config.n_playout,
            is_selfplay=1)

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(
                self.mcts_player, temp=self.config.temp,
                greedy_player=self.mcts_player_greedy, who_greedy="B")
            play_data = list(play_data)
            # augment the data
            play_data = symmetry_board_moves(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        state_batch = [data[0] for data in self.data_buffer]
        mcts_probs_batch = [data[1] for data in self.data_buffer]
        winner_batch = [data[2] for data in self.data_buffer]
        self.policy_value_net.train(state_batch, mcts_probs_batch, winner_batch,
                                    self.config.epochs)
        self.policy_value_net.save_model("model.h5")

    def run(self):
        """run the training pipeline"""
        try:
            self.collect_selfplay_data(self.config.play_batch_size)
            self.policy_update()
        except KeyboardInterrupt:
            print('\n\rquit')

    def summary(self):
        self.policy_value_net.model.summary()
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 6
    self.board_height = 6
    self.n_in_row = 4
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    # add output log
    self.formatter = logging.Formatter(
        '%(asctime)s [%(module)s] %(levelname)s: %(message)s',
        '%Y-%m-%d %H:%M:%S')
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(level=logging.INFO)
    self.handler = logging.FileHandler("output.log")
    self.handler.setLevel(logging.INFO)
    self.handler.setFormatter(self.formatter)
    self.console = logging.StreamHandler()
    self.console.setLevel(logging.INFO)
    self.console.setFormatter(self.formatter)
    self.logger.addHandler(self.handler)
    self.logger.addHandler(self.console)
    if init_model:
        if os.path.exists(init_model):
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            self.logger.error("{} does not exist!\n".format(init_model))
            # __init__ cannot return a value, so fail loudly instead
            raise FileNotFoundError(init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def smart_worker_train():
    pvnet = PolicyValueNet(board_n, model_filename)
    server = SmartServer(pvnet)
    # print("Training")
    # server.train_fn(*server.mem.get_history())
    # print("Done")
    while True:
        server.train()
        pvnet.save_model(model_filename)
def __init__(self):
    # params of the board and the game
    self.board_width = w
    self.board_height = h
    self.n_in_row = l
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.025
    self.check_freq = p
    self.game_batch_num = r
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    print(datetime.datetime.now(a),
          "init....{}x{}x{}".format(self.board_width,
                                    self.board_height,
                                    self.n_in_row))
    # start training from a given policy-value net
    if os.path.isfile('current_policy_{}_{}_{}.model'.format(
            self.board_width, self.board_height, self.n_in_row)):
        print("load old AI model ",
              'current_policy_{}_{}_{}.model'.format(self.board_width,
                                                     self.board_height,
                                                     self.n_in_row))
        policy_param = pickle.load(
            open('current_policy_{}_{}_{}.model'.format(self.board_width,
                                                        self.board_height,
                                                        self.n_in_row), 'rb'),
            encoding='bytes')
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               net_params=policy_param)
    else:
        # start training from a new policy-value net
        print("init new AI model")
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def simple_train():
    # board = Board(board_n, win)
    # game = Game()
    pvnet = PolicyValueNet(board_n, model_filename)
    # mcts_player = MCTSPlayer(pvnet.get_pvnet_fn())
    # bh, ph, vh = game.selfplay(board, mcts_player)
    # bh, ph, vh = game.selfplay(board, HumanPlayer())
    # print(vh)
    while True:
        train(pvnet, config.train_config['train_samples'])
        pvnet.save_model(model_filename)
def main():
    human = HumanWASDPlayer()
    play(human, human)
    pvnet = PolicyValueNet(board_n, model_filename)
    mem = Memory()
    while True:
        mcts_player = MCTSPlayer(pvnet.get_pvnet_fn(), play_style=3)
        bh, ph, vh = play(mcts_player, human, mcts_player)
        mem.save_data((bh, ph, vh))
        mcts_player = MCTSPlayer(pvnet.get_pvnet_fn(), play_style=3)
        bh, ph, vh = play(human, mcts_player, mcts_player)
        mem.save_data((bh, ph, vh))
def run():
    n = 5
    width, height = 8, 8
    model_file = 'current_policy_10.21.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS
        # player written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # To support python3
        best_policy = PolicyValueNet(width, height, model_file)
        mcts_player = MCTSPlayer_Alphago(
            best_policy.policy_value_fn,
            c_puct=5,
            n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def RLput(board, who, n_playout=400):
    model_file = "./best_policy.model"
    # policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')
    best_policy = PolicyValueNet(board.width, board.height, model_file=model_file)
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5, n_playout=n_playout)
    # set the current player; only needed when using do_move
    # board.set_current_player(who)
    # if this is the very first move, play a random spot near the center
    last = board.getLast()
    if last == [-1, -1]:
        row = random.randint(2, 5)
        col = random.randint(2, 5)
        if board[row][col] == 0:
            move = board.location_to_move((row, col))
            if board.do_move(move):
                return True
        return False
    # not the first move: let the MCTS player choose
    move = mcts_player.get_action(board)
    # print(board.current_player, who)
    # input("press any key to continue")
    return board.do_move(move)
def run():
    # play the game against a human
    game = Game()
    # note: the model after 1500 training batches is still weak; the value head
    # is probably not well trained yet and needs more training - 2018.7.11
    best_policy = PolicyValueNet(5, 5)
    mctsplayer = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=10000)
    puremctsplayer = PURE(c_puct=5, n_playout=10000)
    human = Human()
    '''
    # human first, red
    win = {1: 0, 2: 0}
    for i in range(50):
        a = time.time()
        winner = game.start_play(puremctsplayer, mctsplayer, 1, 2, (i % 2 + 1),
                                 is_show=1)
        if winner == 1:
            win[1] += 1
        else:
            win[2] += 1
        # print(i, 'winner is', 'red' if winner == 1 else 'blue')
        print(i, 'blue win rate:', win[2] / (i + 1))
        print(i, 'cost:', time.time() - a, 's')
    # print('win rating ...', win[2] / 100)
    '''
    '''
    import time
    a = time.time()
    game.start_self_play(mctsplayer, is_show=1)
    print(time.time() - a)
    '''
    game.start_play(human, mctsplayer, 1, 2, 1, is_show=1)
def run():
    n = 4
    width, height = 6, 6
    model_file = './models_664_origpure_nofrust/current_policy_3450.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in TensorFlow
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=800,
                                 is_selfplay=0,
                                 disp=True)

        # for MCTS without neural network
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=5)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def net_mcts_play(board_width, board_height, n_in_row):
    game_board = GomokuBoard(width=board_width, height=board_height,
                             n_in_row=n_in_row)
    brain = PolicyValueNet(board_width, board_height,
                           ".\\CurrentModel\\GomokuAi")
    net_player = MCTSPlayer(brain.policy_value, 2000)
    mcts_player = MCTSPlayer(rollout_policy_value, 10000)
    while True:
        action = mcts_player.get_action(game_board)
        game_board.move(action)
        end, winner = game_board.check_winner()
        game_board.dbg_print()
        if end:
            print(winner)
            break
        action, prob = net_player.get_action_prob(game_board)
        game_board.move(action)
        end, winner = game_board.check_winner()
        print(action, prob)
        game_board.dbg_print()
        if end:
            print(winner)
            break
def run():
    width, height, n = 6, 6, 4
    board = Board(width=width, height=height, n_in_row=n)
    best_policy = PolicyValueNet(width, height, 'current_policy.model')
    # set a larger n_playout below for better performance
    mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)
    board.init_board(0)
    p1, p2 = board.players
    mcts_player.set_player_ind(p2)
    players = {p1: 'Human Player', p2: mcts_player}
    app_tk = tk.Tk()
    app_tk.resizable(False, False)
    app_tk.geometry('{}x{}+{}+{}'.format(cell_size * width, cell_size * height,
                                         cell_size, cell_size))
    app_tk.title('Human VS AI - Gomoku')
    for x in range(width):
        cells_column = []
        gui_cells.append(cells_column)
        for y in range(height):
            cells_column.append(GuiCell(app_tk, board, players, (x, y)))
    app_tk.mainloop()
def run():
    n = 5
    width, height = 10, 10
    model_file = 'current_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        graphic = Graphic()
        # graphic.run()
        print(1111)
        # thread1 = threading.Thread(target=graphic.run, args=())
        best_policy = PolicyValueNet(width, height,
                                     model_file='./model/' + model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5, n_playout=1200)
        human = Human(graphic)
        # set start_player=0 for human first
        thread2 = threading.Thread(target=game.start_play,
                                   args=(human, mcts_player, graphic, 1, 1))
        # game.start_play(human, mcts_player, graphic, start_player=0, is_shown=1)
        thread2.setDaemon(True)
        thread2.start()
        graphic.run()
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model=None):
    # params of the board and the game
    # basic params
    self.board_width = 9
    self.board_height = 9
    self.n_in_row = 5
    # init the board and game
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 3e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1e-3  # the temperature param
    # self.n_playout = 400  # num of simulations for each move
    self.n_playout = 400
    self.c_puct = 3
    # a number in (0, inf) that controls how quickly exploration
    # converges to the maximum-value policy. A higher value means
    # relying on the prior more.
    self.buffer_size = 10000
    # self.batch_size = 512  # mini-batch size for training
    self.batch_size = 256
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1000
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 400
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 6   # board width
    self.board_height = 6  # board height
    self.n_in_row = 4      # win condition: how many stones in a row win the game
    # instantiate a Board with the given width, height, and win condition
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    # initialize the network and the search tree; the network is kept for the
    # whole run, while it is less clear at which points the tree gets reset
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_length = 6
    self.n_in_row = 4
    self.num_history = 2
    self.chess = chessboard(self.board_length, self.n_in_row)
    # training params
    self.learn_rate = 5e-4
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temperature = 1.0  # the temperature param
    self.cpuct = 5
    self.buffer_size = 10000
    self.batch_size = 512
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 10
    self.kl_targ = 0.02
    self.check_freq = 50
    self.best_win_ratio = 0.0
    self.game_batch_num = 4000
    self.loss_dict = {}
    self.loss_hold = 50
    self.real_mcts_simulation_times = 400
    self.pure_mcts_simulation_times = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_length,
                                               self.num_history,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_length,
                                               self.num_history)
    # TODO: decide whether self.chess needs to be deep-copied here
    self.mcts_player = real_mcts(self.chess,
                                 self.policy_value_net.policy_value,
                                 self.cpuct,
                                 self.real_mcts_simulation_times,
                                 self.temperature,
                                 self.num_history,
                                 True)
def __init__(self, init_model=None):
    # params of the board and the game
    self.game = Game()
    # training params
    self.config = TrainConfig()
    self.data_buffer = deque(maxlen=self.config.buffer_size)
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet()
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.config.c_puct,
                                  n_playout=self.config.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 6
    self.board_height = 6
    self.n_in_row = 4
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # board settings
    self.board_width = 8
    self.board_height = 8
    # self.n_in_row = 5
    self.board = chessboard(row=self.board_width, col=self.board_height)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0
    self.temp = 1.0
    self.n_playout = 400  # number of simulations per move
    self.c_puct = 5
    self.buffer_size = 10000000
    self.batch_size = 512  # samples per mini-batch
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # number of training iterations per update
    self.kl_targ = 0.02
    self.check_freq = 2
    # number of self-play batches
    self.game_batch_num = 1000
    self.best_win_ratio = 0.0
    # pure Monte Carlo tree search, used as a baseline opponent
    self.pure_mcts_playout_num = 400
    # if a pre-trained model is given
    if init_model:
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model,
                                               use_gpu=True)
    else:
        # train from scratch
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               use_gpu=True)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def test_playout():
    from quoridor import Quoridor
    from policy_value_net import PolicyValueNet
    c_puct = 5
    n_playout = 400
    policy_value_net = PolicyValueNet(model_file=None, use_gpu=True)
    mcts = MCTS(policy_value_net.policy_value_fn,
                c_puct=c_puct, n_playout=n_playout)
    q = Quoridor()
    acts, act_probs = mcts.get_move_probs(q)
    print(acts)
    print(act_probs)
def __init__(self):
    # params of the board and the game
    self.board_width = 9
    self.board_height = 9
    self.board = Board(width=self.board_width, height=self.board_height)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 800  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_loss = None
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    init_model = 'checkpoint/current_policy.model'
    if os.path.isfile(init_model + '.index'):
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_prob, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(np.flipud(
                    mcts_prob.reshape(self.board_height, self.board_width)), i)
                extend_data.append((equi_state,
                                    np.flipud(equi_mcts_prob).flatten(),
                                    winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append((equi_state,
                                    np.flipud(equi_mcts_prob).flatten(),
                                    winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch,
                mcts_probs_batch,
                winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(np.sum(old_probs * (
                np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = (1 -
                             np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 -
                             np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}"
               ).format(kl,
                        self.lr_multiplier,
                        loss,
                        entropy,
                        explained_var_old,
                        explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num,
            win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i+1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                # check the performance of the current model,
                # and save the model params
                if (i+1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i+1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model('./current_policy.model')
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model('./best_policy.model')
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
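# The snippet below is a minimal usage sketch, not code taken from any of the
# sources above: it assumes the TrainPipeline class defined directly above and
# simply constructs it and calls run(); the __main__ guard and the commented
# resume path are illustrative assumptions.
if __name__ == '__main__':
    # to resume training, pass a saved model path instead, e.g.
    # training_pipeline = TrainPipeline(init_model='./current_policy.model')
    training_pipeline = TrainPipeline()
    training_pipeline.run()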