Example #1
def main(debug=False):
    
    model_file = os.path.join(curr_dir, "../model/best_model_15_5.pth")
    policy_value_net = PolicyValueNet(size, model_file=model_file)

    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind("tcp://*:5555")
    print("Server start on 5555 port")
    while True:
        message = socket.recv()
        try:
            message = message.decode('utf-8')
            actions = json.loads(message)
            print("Received: %s" % message)

            start = datetime.now()
            mcts_player = MCTSPlayer(policy_value_net.policy_value_fn, c_puct=c_puct, n_playout=n_playout, is_selfplay=0)
            # result = predict
            game = FiveChess(size=size, n_in_row=n_in_row)
            for act in actions:
                step=(act[0],act[1])
                game.step_nocheck(step)

            action, value = mcts_player.get_action(game, return_value=1)

            result = {"action":action, "value": value}

            print(result)

            print('time used: {} sec'.format((datetime.now() - start).total_seconds()))
            socket.send_string(json.dumps(result, ensure_ascii=False))
        except Exception as e:
            traceback.print_exc()
            socket.send_string(json.dumps({"error":str(e)}, ensure_ascii=False))
Example #2
    def policy_evaluate(self):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        player = MCTSPlayer(self.policy_value_net.policy_value,
                            c_puct=self.c_puct,
                            n_playout=30)
        environment = Molecule(["C", "O", "N"],
                               init_mol=self.mol,
                               allow_removal=True,
                               allow_no_modification=False,
                               allow_bonds_between_rings=False,
                               allowed_ring_sizes=[5, 6],
                               max_steps=10,
                               target_fn=None,
                               record_path=False)
        environment.initialize()
        environment.init_qed = QED.qed(Chem.MolFromSmiles(self.mol))

        moves, fp, _S_P, _Qs = player.get_action(environment,
                                                 temp=self.temp,
                                                 return_prob=1,
                                                 rand=False)

        return moves, _S_P, _Qs
Example #3
def run():
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    model_dir = os.path.join(curr_dir, './model/')
    model_file = os.path.join(model_dir, 'model.pth')

    try:
        agent = Agent()
        # agent.limit_piece_count = 8
        # agent.limit_max_height = 10
        env = TetrominoEnv(agent.tetromino)
        # policy-value network
        net_policy = PolicyValueNet(10, 20, 5, model_file=model_file)
        mcts_ai_player = MCTSPlayer(net_policy.policy_value_fn,
                                    c_puct=1,
                                    n_playout=64)
        # agent.start_play(mcts_ai_player, env)
        while not agent.terminal:
            if agent.curr_player == 0:
                # act_probs, value = net_policy.policy_value_fn(agent)
                # act = max(act_probs,  key=lambda act_prob: act_prob[1])[0]
                # print(act, act_probs, value)
                act = mcts_ai_player.get_action(agent)
            else:
                act = 4
            agent.step(act, env)

        agent.print()
    except KeyboardInterrupt:
        print('quit')
Example #4
    def collect_selfplay_data(self, i):
        """收集自我对抗数据用于训练"""
        # 使用MCTS蒙特卡罗树搜索进行自我对抗
        logging.info("TRAIN Self Play starting ...")
        agent = Agent(size, n_in_row, is_shown=0)
        # 创建使用策略价值网络来指导树搜索和评估叶节点的MCTS玩家
        if i % 2 == 0:
            mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout,
                                     is_selfplay=1)
            pure_mcts_player = None
            mcts_player.mcts._limit_max_var = False
        else:
            if os.path.exists(best_model_file):
                best_policy_value_net = PolicyValueNet(
                    size, model_file=best_model_file)
            else:
                best_policy_value_net = self.policy_value_net
            mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout,
                                     is_selfplay=1)
            pure_mcts_player = MCTSPlayer(
                best_policy_value_net.policy_value_fn,
                c_puct=self.c_puct + 0.5,
                n_playout=self.n_playout,
                is_selfplay=1)
            mcts_player.mcts._limit_max_var = False
            pure_mcts_player.mcts._limit_max_var = False

        # start the game
        winner, play_data = agent.start_self_play(mcts_player,
                                                  pure_mcts_player,
                                                  temp=self.temp)
        agent.game.print()

        if winner is None or play_data is None:
            print("give up this agent")
            return

        if pure_mcts_player is not None:
            if winner == mcts_player.player:
                self.c_puct_win[0] = self.c_puct_win[0] + 1
            else:
                self.c_puct_win[1] = self.c_puct_win[1] + 1

        play_data = list(play_data)[:]
        # augment the sample set with flipped boards
        play_data = self.get_equi_data(play_data)
        logging.info("Self Play end. length:%s saving ..." % len(play_data))
        logging.info("c_puct:{}/{} = {}/{}".format(self.c_puct,
                                                   self.c_puct + 0.5,
                                                   self.c_puct_win[0],
                                                   self.c_puct_win[1]))

        # save the training data
        for obj in play_data:
            self.save_wait_data(obj)
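get_equi_data is called above but not shown. A typical board-symmetry augmentation for this kind of self-play data looks roughly like the sketch below; it assumes each sample is a (state, mcts_prob, winner) tuple with square state planes and a row-major flattened probability vector, which may need adjusting to this project's move indexing.

import numpy as np

def get_equi_data(self, play_data):
    """Augment samples with the eight board symmetries (4 rotations x 2 flips)."""
    extend_data = []
    for state, mcts_prob, winner in play_data:
        state = np.asarray(state)                      # (channels, H, W) planes
        prob_board = np.asarray(mcts_prob).reshape(state.shape[-2], state.shape[-1])
        for i in range(4):
            # rotate the state planes and the move probabilities together
            equi_state = np.array([np.rot90(plane, i) for plane in state])
            equi_prob = np.rot90(prob_board, i)
            extend_data.append((equi_state, equi_prob.flatten(), winner))
            # also keep the horizontally flipped variant
            flip_state = np.array([np.fliplr(plane) for plane in equi_state])
            extend_data.append((flip_state, np.fliplr(equi_prob).flatten(), winner))
    return extend_data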
Example #5
	def initPlayers(self):
		self.width = 9
		self.height = 9
		self.board = Board(width=self.width, height=self.height, n_in_row=5)
		self.mcts_player = MCTSPlayer(c_puct=5, n_playout=1000)
		self.human_player = HumanPlayer()

		self.start_player = 0	# 0 - human, 1 - mcts_player

		self.board.init_board(self.start_player)
		p1, p2 = self.board.players
		self.human_player.set_player_id(p1)
		self.mcts_player.set_player_id(p2)
		self.players = {p2: self.mcts_player, p1: self.human_player}
		self.board.show(self.human_player.playerId, self.mcts_player.playerId)
Example #6
 def init(self):
     self.dataset = list()
     computerA = MCTSPlayer(value_function=self.ai.value_function, 
                     c_puct=self.c_puct, 
                     n_playout=self.n_playout, 
                     is_selfplay=True,
                     role='Self_A',
                     verbose=self.verbose)
     computerB = MCTSPlayer(value_function=self.ai.value_function, 
                     c_puct=self.c_puct, 
                     n_playout=self.n_playout, 
                     is_selfplay=True,
                     role='Self_B',
                     verbose=self.verbose)
     self.gameenigne = GameEngine(playerA=computerA, playerB=computerB, verbose=self.verbose)
Example #7
File: train.py Project: BiwefC/Gomoku
 def __init__(self, init_model=True):
     self.config = GomokuConfig()
     # params of the board and the game
     self.board = GomokuBase(width=self.config.board_width,
                             height=self.config.board_height,
                             n_to_win=self.config.n_to_win,
                             use_forbidden=self.config.use_forbidden)
     self.game = GomokuServer(self.board)
     # training params
     self.data_buffer = deque(maxlen=self.config.buffer_size)
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     #self.pure_mcts_playout_num = 1000
     if init_model:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(
             self.config.board_width,
             self.config.board_height,
             model_file=self.config.model_path)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.config.board_width,
                                                self.config.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.config.c_puct,
                                   n_playout=self.config.n_playout,
                                   is_selfplay=True)
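All of the MCTSPlayer constructions above pass a policy_value_fn. In this AlphaZero-Gomoku style of code, that function maps a board to (action, probability) pairs over the legal moves plus a scalar value estimate. As a hedged illustration, a uniform stand-in (handy for smoke-testing the player without a trained net) might look like this; board.availables is an assumption borrowed from that style of Board API.

import numpy as np

def uniform_policy_value_fn(board):
    """Return (action, prob) pairs over legal moves and a value in [-1, 1]."""
    legal_moves = board.availables                     # assumed Board attribute
    probs = np.ones(len(legal_moves)) / len(legal_moves)
    return zip(legal_moves, probs), 0.0                # neutral value estimate

# e.g. MCTSPlayer(uniform_policy_value_fn, c_puct=5, n_playout=400, is_selfplay=False)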
Example #8
File: test.py Project: one-leaf/pytorch
def run():
    size = 15  # board size
    n_in_row = 5  # how many in a row to win

    curr_dir = os.path.dirname(os.path.abspath(__file__))
    model_dir = os.path.join(curr_dir, './model/')
    model_file = os.path.join(model_dir, 'model_%s_%s.pth' % (size, n_in_row))

    try:
        agent = Agent(size=size, n_in_row=n_in_row)
        # ############### human VS AI ###################

        # policy-value network
        net_policy = PolicyValueNet(size, model_file=model_file)
        mcts_ai_player = MCTSPlayer(net_policy.policy_value_fn,
                                    c_puct=4,
                                    n_playout=500,
                                    is_selfplay=0)

        # pure MCTS player
        # mcts_player = MCTSPurePlayer(c_puct=5, n_playout=2000)

        # human player
        human = Human(agent, is_show=1)

        # set start_player=0 so the AI moves first
        agent.start_play(mcts_ai_player, human, start_player=0)
        agent.game.print()
        agent.env.close()
        # agent.start_play(human, human, start_player=0 if random.random()>0.5 else 1)
    except KeyboardInterrupt:
        print('quit')
Example #9
    def __init__(self, board_width, board_height, net_params = None):

        # init network parameters
        self.learning_rate = 5e-3
        self.l2_const = 1e-4  # coefficient of the L2 penalty
        self.lr_multiplier = 1.0
        self.temp = 1.0  # the temperature parameter
        self.n_playout = 400  # number of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5 #number of train step for each update
        self.kl_targ = 0.025
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        self.pure_mcts_playout_num = 1000
        
        # initial env
        self.board = Board()
        self.game = Game(self.board)
        self.board_width = board_width
        self.board_height = board_height

        self.create_policy_value_net()
        self._loss_train_op()
        
        #init mcts player
        self.mcts_player = MCTSPlayer(self.policy_value_fn, self.board.get_current_player(), c_puct = self.c_puct, n_playout = self.n_playout, is_selfplay = 1)
Example #10
 def __init__(self, init_model=None):
     # board parameters
     self.game = Quoridor()
     # training parameters
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate
     self.temp = 1.0
     self.n_playout = 400
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 128  # use 1 when testing
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5
     self.kl_targ = 0.02
     self.check_freq = 50
     self.game_batch_num = 1500
     self.best_win_ratio = 0.0
     self.pure_mcts_playout_num = 1000
     if init_model:
         self.policy_value_net = PolicyValueNet(model_file=init_model)
     else:
         self.policy_value_net = PolicyValueNet()
     # set up the AI (computer) player
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #11
    def __init__(self, init_model=None):
        self.game = Quoridor()


        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0
        self.temp = 1.0
        self.n_playout = 200
        self.c_puct = 5
        self.buffer_size = 10000
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.kl_targ = 0.02
        self.check_freq = 10
        self.game_batch_num = 1000
        self.best_win_ratio = 0.0
        self.pure_mcts_playout_num = 1000

        self.old_probs = 0
        self.new_probs = 0

        self.first_trained = False

        if init_model:
            self.policy_value_net = PolicyValueNet(model_file=init_model)
        else:
            self.policy_value_net = PolicyValueNet()

        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct,
                                      n_playout=self.n_playout, is_selfplay=1)
Example #12
 def __init__(self, init_model=None, size=8):
     # board size 8*8, 5 in a row to win
     self.board_width = size
     self.board_height = size
     self.n_in_row = 5  # how many in a row to win
     self.policy_evaluate_size = 10  # number of games played when evaluating the policy win rate
     self.game_batch_num = 10000  # number of self-play games
     self.batch_size = 512  # start training once data_buffer holds more than this many games
     self.check_freq = 50  # every n games, check the current model's win rate against the old model
     self.board = Board(width=self.board_width, height=self.board_height, n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # KL-based adaptive learning rate
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # number of simulations per move
     self.buffer_size = 10000  # number of game records to cache
     self.data_buffer = deque(maxlen=self.buffer_size)  # full game history used for training
     self.play_batch_size = 1
     self.epochs = 5  # number of training steps per policy-value-net update
     self.kl_targ = 0.02  # KL target for the policy-value network
     self.best_win_ratio = 0.0
     # number of playouts for the pure MCTS used to evaluate the policy model
     self.pure_mcts_playout_num = 1000  # random playout steps when the pure MCTS builds its tree
     self.c_puct = 5  # exploration/exploitation trade-off constant for MCTS
     if init_model:
         # start from a previously trained policy-value network
         self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, model_file=init_model)
     else:
         # start from a new policy-value network
         self.policy_value_net = PolicyValueNet(self.board_width, self.board_height)
     # create an MCTS player that uses the policy-value net to guide tree search and evaluate leaf nodes
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout, is_selfplay=1)
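Several of these trainers keep kl_targ and lr_multiplier around for a KL-driven learning-rate adjustment inside policy_update(), which is not shown here. A common form of that step, written as a hypothetical standalone helper and assuming old_probs/new_probs are the policy outputs on the same mini-batch before and after an update, is:

import numpy as np

def adjust_lr_multiplier(old_probs, new_probs, lr_multiplier, kl_targ=0.02):
    """Return the KL between old and new policies and an adjusted LR multiplier."""
    kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                     np.log(new_probs + 1e-10)), axis=1))
    if kl > kl_targ * 2 and lr_multiplier > 0.1:
        lr_multiplier /= 1.5    # the update moved the policy too far; slow down
    elif kl < kl_targ / 2 and lr_multiplier < 10:
        lr_multiplier *= 1.5    # the update barely moved the policy; speed up
    return kl, lr_multiplier

# the optimizer is then run with learn_rate * lr_multiplier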
Example #13
 def __init__(self):
     # params of the board and the game
     self.board_width = 6
     self.board_height = 6
     self.n_in_row = 4
     self.board = ShogiBoard()
     # training params
     self.learn_rate = 5e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.025
     self.check_freq = 50
     self.game_batch_num = 3000
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     # start training from a given policy-value net
     #        policy_param = pickle.load(open('current_policy.model', 'rb'))
     #        self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params = policy_param)
     # start training from a new policy-value net
     self.policy_value_net = PolicyValueNet()
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #14
    def policy_evaluate(self):
        """
        Policy win-rate evaluation: play n games between the current model and the best model and track the win rate.
        """
        # if no best model exists yet, save the current model as the best model
        if not os.path.exists(best_model_file):
            self.policy_value_net.save_model(best_model_file)
            return

        # the currently trained model
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        if self.best_policy_value_net is None:
            self.best_policy_value_net = PolicyValueNet(
                size, model_file=best_model_file)
        best_mcts_player = MCTSPlayer(
            self.best_policy_value_net.policy_value_fn,
            c_puct=self.c_puct,
            n_playout=self.n_playout)

        current_mcts_player.mcts._limit_max_var = False
        best_mcts_player.mcts._limit_max_var = False

        agent = Agent(size, n_in_row, is_shown=0)
        winner, play_data = agent.start_self_evaluate(
            current_mcts_player,
            best_mcts_player,
            temp=self.temp,
            start_player=sum(self.best_win) % 2)
        if winner == current_mcts_player.player:
            self.best_win[0] = self.best_win[0] + 1
            print("Curr Model Win!", "win:", self.best_win[0], "lost",
                  self.best_win[1])
        if winner == best_mcts_player.player:
            self.best_win[1] = self.best_win[1] + 1
            print("Curr Model Lost!", "win:", self.best_win[0], "lost",
                  self.best_win[1])
        agent.game.print()

        # save the training data
        play_data = list(play_data)[:]
        play_data = self.get_equi_data(play_data)
        logging.info("Eval Play end. length:%s saving ..." % len(play_data))
        for obj in play_data:
            self.save_wait_data(obj)
Example #15
def main():
    width = 15
    height = 15
    net = PolicyValueNet(width, height)
    player = MCTSPlayer(net, n_playout=1000, is_selfplay=True)
    trainer = Trainer(width, height, net)
    for i in range(1500):
        print("episode " + str(i) + "...\n")
        board = NewBoard(width, height)
        trainer.simulate(board, player)
        trainer.train()
Example #16
class Engine():
    def __init__(self, time=5, policy=None):
        self.player = MCTSPlayer(time, policy)

    def set_komi(self, komi):
        go.KOMI = komi

    def set_size(self, size):
        go.init(size)

    def clear(self):
        self.player.clear()

    def debug(self, info=''):
        print(self.player.debug_info + info)

    def move(self, color, vertex=None):
        legal = True

        if vertex is None:
            vertex = self.player.move()
            legal = vertex is not None
        else:
            legal = go.move(vertex)

        if legal:
            take = go.get_take(go.POSITION)
            return go.toJI(go.POSITION.vertex), {go.toJI(v) for v in take}
        else:
            return None, {}

    def get_score(self):
        return go.POSITION.result()

    def save(self):
        return go.POSITION.toJSON()

    def load(self, str):
        go.POSITION.fromJSON(str)
Example #17
def run_play(cmd_line_args=None):

    # Set initial conditions
    policy = simplenet.PolicyValue(simplenet.PolicyValue.create_network())
    policy.load()

    boardsize = policy.model.input_shape[-1]
    best_player = MCTSPlayer(policy.eval_value_state,
                             policy.eval_policy_state,
                             n_playout=10,
                             evaluating=True)
    human_player = Human(boardsize)
    run_a_game(best_player, human_player, boardsize)
Example #18
File: test.py Project: one-leaf/pytorch
def run():
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    model_dir = os.path.join(curr_dir, './model/')
    model_file = os.path.join(model_dir, 'model-cnn.pth')

    try:
        agent = Agent()
        agent.limit_piece_count = 0
        agent.limit_max_height = 10
        # env = TetrominoEnv(agent.tetromino)
        # policy-value network
        net_policy = PolicyValueNet(10, 20, 5, model_file=model_file)
        mcts_ai_player = MCTSPlayer(net_policy.policy_value_fn,
                                    c_puct=1,
                                    n_playout=64)
        # agent.start_play(mcts_ai_player, env)
        while not agent.terminal:
            act = mcts_ai_player.get_action(agent)
            # agent.step(act, env)
            agent.step(act)
            print(agent.get_availables())
            agent.print2(True)
    except KeyboardInterrupt:
        print('quit')
Example #19
    def __init__(self, init_model=None):
        # set the board and game parameters
        '''
        self.node1 = node({'cpu':20, 'memory':20, 'gpu':0})
        self.node2 = node({'cpu':20, 'memory':20, 'gpu':0})
        self.node3 = node({'cpu':50, 'memory':50, 'gpu':50})
        self.node_dict = {'node1':self.node1, 'node2':self.node2, 'node3':self.node3}
        self.data_name = 'gpu'
        self.c_puct_list = [0.03,0.3,3]
        self.n_job_thread_list = [0,5]
        self.probability_1_list = [0,0.03,0.3]
        self.probability_2_list = [0.3,0.6,0.9]
        '''
        self.node1 = node({'cpu': 30, 'memory': 30, 'gpu': 30, 'fpga': 0})
        self.node2 = node({'cpu': 30, 'memory': 30, 'gpu': 0, 'fpga': 30})
        self.node3 = node({'cpu': 50, 'memory': 50, 'gpu': 50, 'fpga': 50})
        self.node4 = node({'cpu': 30, 'memory': 30, 'gpu': 0, 'fpga': 0})
        self.node5 = node({'cpu': 30, 'memory': 30, 'gpu': 0, 'fpga': 0})
        # proportionally, larger values should make the effect more pronounced
        self.node_dict = {
            'node1': self.node1,
            'node2': self.node2,
            'node3': self.node3,
            'node4': self.node4,
            'node5': self.node5
        }
        self.data_name = 'fpga_gpu'
        self.c_puct_list = [0.03, 0.3, 3]
        self.n_job_thread_list = [0, 5]
        self.probability_1_list = [0, 0.03, 0.3]
        self.probability_2_list = [0.3, 0.6, 0.9]

        #self.weight = {'cpu':0.3, 'memory':0.2, 'gpu':0.5}
        self.weight = None
        self.state = State(self.node_dict)
        self.game = Game(self.node_dict, self.weight)
        # set the training parameters
        self.n_playout = 1000  # number of simulations per move
        self.c_puct = 1  # trade-off coefficient between exploitation and exploration
        self.game_batch_num = 3
        self.n_job_thread = 6  #0
        self.probability_1 = 0  #0
        self.probability_2 = 0.2  #0.2
        #self.path = r'D:\科研\论文\High effient resource scheduling for cloud based on modified MCTS\programing\parameter_check_on_have_fpga.pkl'
        # AI player: set is_selfplay=1 for self-play, since we are training
        self.mcts_player = MCTSPlayer(c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
Example #20
def run():
    n = 5
    width, height = 8, 8
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        mcts_player = MCTSPlayer(c_puct=5, n_playout=400)
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=0)
        # after each game, store the state-node pairs
        store_object(mcts_player.state_node_pairs, "Data")
    except KeyboardInterrupt:

        print('\n\rquit')
Example #21
 def policy_evaluate(self, n_games=10):
     """
     Policy win-rate evaluation: play n games between the model and a pure MCTS player and track the win rate.
     """
     # AlphaGo Zero style MCTS player (uses the policy-value network to guide tree search and evaluate leaf nodes)
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout)
     # pure MCTS player
     pure_mcts_player = MCTSPurePlayer(c_puct=5, n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):  # play the evaluation games
         winner = self.game.start_play(current_mcts_player, pure_mcts_player, start_player=i % 2, is_shown=0)
         win_cnt[winner] += 1
     # win rate
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
     logging.info("TRAIN Num_playouts:{}, win: {}, lose: {}, tie:{}, win_ratio:{}".format(self.pure_mcts_playout_num,
                                                                            win_cnt[1], win_cnt[2], win_cnt[-1], win_ratio))
     return win_ratio
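In trainers of this style the win ratio returned by policy_evaluate() usually gates checkpointing. A hedged sketch of the calling code (the best_policy.model path and the 5000-playout cap are assumptions modeled on the common AlphaZero-Gomoku training loop):

# inside the training loop, every check_freq batches (sketch)
win_ratio = self.policy_evaluate()
if win_ratio > self.best_win_ratio:
    self.best_win_ratio = win_ratio
    self.policy_value_net.save_model('./best_policy.model')  # assumed path
    # once the net beats pure MCTS every time, strengthen the opponent
    if self.best_win_ratio == 1.0 and self.pure_mcts_playout_num < 5000:
        self.pure_mcts_playout_num += 1000
        self.best_win_ratio = 0.0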
Example #22
 def __init__(self, mol=None, init_model=None):
     # params of the board and the game
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 30  # num of simulations for each move
     self.c_puct = 1
     self.buffer_size = 200
     self.batch_size = 200  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.epochs = 50  # num of train_steps for each update
     self.kl_targ = 0.2
     self.check_freq = 5
     self.mol = mol
     self.play_batch_size = 1
     self.game_batch_num = 15
     self.in_dim = 1024
     self.n_hidden_1 = 1024
     self.n_hidden_2 = 1024
     self.out_dim = 1
     self.output_smi = []
     self.output_qed = []
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     if init_model:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(self.in_dim,
                                                self.n_hidden_1,
                                                self.n_hidden_2,
                                                self.out_dim,
                                                model_file=init_model)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.in_dim,
                                                self.n_hidden_1,
                                                self.n_hidden_2,
                                                self.out_dim)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #23
    def collect_selfplay_data(self):
        """收集自我对抗数据用于训练"""       
        # 使用MCTS蒙特卡罗树搜索进行自我对抗
        logging.info("TRAIN Self Play starting ...")
        # 游戏代理
        agent = Agent()

        # create an MCTS player that uses the policy-value net to guide tree search and evaluate leaf nodes
        mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout, is_selfplay=1)
        # start the game
        winer, play_data = agent.start_self_play(mcts_player, temp=self.temp)
        play_data = list(play_data)[:]
        episode_len = len(play_data)

        # add the flipped-board data to the data set
        # play_data = self.get_equi_data(play_data)
        logging.info("TRAIN Self Play end. length:%s saving ..." % episode_len)
        # save the self-play data to data_buffer
        for obj in play_data:
            self.dataset.save(obj)
Example #24
File: play.py Project: BiwefC/Gomoku
 def __init__(self, player1=GomokuPlayer.Human, player2=GomokuPlayer.Human):
     # params of the board and the game
     self.config = GomokuConfig()
     self.board = GomokuBase(width=self.config.board_width,
                             height=self.config.board_height,
                             n_to_win=self.config.n_to_win)
     self.game = GomokuServer(self.board, player1, player2)
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     # start training from an initial policy-value net
     if player1 == GomokuPlayer.AI or player2 == GomokuPlayer.AI:
         self.policy_value_net = PolicyValueNet(
             self.config.board_width,
             self.config.board_height,
             model_file=self.config.model_path)
         self.mcts_player = MCTSPlayer(
             self.policy_value_net.policy_value_fn,
             c_puct=self.config.c_puct,
             n_playout=self.config.n_playout_play,
             is_selfplay=False)
Example #25
 def __init__(self, size=(8, 8), init_model=None):
     # params of the board and the game
     print(size)
     self.board_width = size[1]
     self.board_height = size[0]
     self.board = GomokuBoard(size=(self.board_width, self.board_height))
     self.game = GomokuGame(self.board)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02
     self.check_freq = 50
     self.game_batch_num = 3000
     self.best_win_ratio = 0.0
     self.all_loss = []
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     if init_model:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example #26
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing games against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         print("train-policy_evaluate: game = %d" % (i))
         winner = start_play(self.board,
                             current_mcts_player,
                             pure_mcts_player,
                             startPlayer=i % 2)
         win_cnt[winner] += 1
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[0]) / n_games
     print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
         self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[0]))
     return win_ratio
Example #27
from game import AlternateTurnGame
from helper import User, RandomPlayer
from mcts import MCTSPlayer

from tictactoe import TTTState
from connectn import ConnectNState

board1 = TTTState()
game1 = AlternateTurnGame(2, board1)
board2 = ConnectNState()
game2 = AlternateTurnGame(2, board2)

#p1 = MCTSPlayer()
players = [RandomPlayer(), RandomPlayer()]
players = [MCTSPlayer(), MCTSPlayer()]
print(game1.play_games(1, players))

#print(game2.play_games(1000, [RandomPlayer()]*2))
#print(game1.play_games(1000, [RandomPlayer()]*2))
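As a quick sanity check of the tree search, the same play_games API can pit MCTSPlayer against RandomPlayer over many games (a small hedged sketch; the result format is whatever play_games already prints above):

# MCTS vs. random baseline on Tic-Tac-Toe, reusing game1 from above (sketch)
mixed_players = [MCTSPlayer(), RandomPlayer()]
print(game1.play_games(100, mixed_players))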
Example #28
 def __init__(self, time=5, policy=None):
     self.player = MCTSPlayer(time, policy)
Example #29
    model_file = '{}/model/best_policy_{}x{}.model'.format(CUR_PATH, size, size)
    try:
        # initialize the board
        board = Board(width=size, height=size, n_in_row=5)
        game = Game(board)

        # initialize the AI player
        best_policy = PolicyValueNet(size, size, model_file=model_file)
        """
        # load the trained model with numpy (only for models trained with Theano/Lasagne)
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(size, size, policy_param)
        """
        mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=900)

        # pure MCTS player
        # mcts_player = MCTSPurePlayer(c_puct=5, n_playout=4000)

        # initialize the human player; move command format: 2,3
        human_player = HumanPlayer()

        # start the game (start_player=0: human first / 1: machine first)
        game.start_play(human_player, mcts_player, start_player=1, is_shown=1)

    except KeyboardInterrupt:
        print('\n\rquit')
Example #30
    def collect_selfplay_data(self):
        """收集自我对抗数据用于训练"""
        # 使用MCTS蒙特卡罗树搜索进行自我对抗
        logging.info("TRAIN Self Play starting ...")
        # 游戏代理
        agent = Agent()

        # create an MCTS player that uses the policy-value net to guide tree search and evaluate leaf nodes
        mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                 c_puct=self.c_puct,
                                 n_playout=self.n_playout,
                                 is_selfplay=1)
        for _ in range(3):
            # start playing
            reward, piececount, agentcount, play_data = agent.start_self_play(
                mcts_player, temp=self.temp)
            play_data = list(play_data)[:]
            episode_len = len(play_data)

            # add the flipped-board data to the data set
            # play_data = self.get_equi_data(play_data)
            logging.info("TRAIN Self Play end. length:%s saving ..." %
                         episode_len)
            # save the self-play data to data_buffer
            for obj in play_data:
                filename = "{}.pkl".format(uuid.uuid1())
                savefile = os.path.join(data_wait_dir, filename)
                pickle.dump(obj, open(savefile, "wb"))
                # self.dataset.save(obj)

            if agent.limit_max_height == 10:
                jsonfile = os.path.join(data_dir, "result.json")
                if os.path.exists(jsonfile):
                    result = json.load(open(jsonfile, "r"))
                else:
                    result = {"reward": 0, "steps": 0, "agent": 0}
                if "1k" not in result:
                    result["1k"] = {"reward": 0, "steps": 0, "agent": 0}
                result["reward"] = result["reward"] + reward
                result["steps"] = result["steps"] + piececount
                result["agent"] = result["agent"] + agentcount
                result["1k"]["reward"] = result["1k"]["reward"] + reward
                result["1k"]["steps"] = result["1k"]["steps"] + piececount
                result["1k"]["agent"] = result["1k"]["agent"] + agentcount

                if result["agent"] > 0 and result["agent"] % 100 <= 1:
                    result[str(result["agent"])] = {
                        "reward":
                        result["1k"]["reward"] / result["1k"]["agent"],
                        "steps": result["1k"]["steps"] / result["1k"]["agent"]
                    }

                if result["agent"] > 0 and result["agent"] % 1000 == 0:

                    # save an extra model snapshot
                    steps = round(result["1k"]["steps"] /
                                  result["1k"]["agent"])
                    model_file = os.path.join(model_dir,
                                              'model_%s.pth' % steps)
                    self.policy_value_net.save_model(model_file)

                    for key in list(result.keys()):
                        if key.isdigit():
                            c = int(key)
                            if c % 1000 > 10:
                                del result[key]
                    result["1k"] = {"reward": 0, "steps": 0, "agent": 0}

                json.dump(result, open(jsonfile, "w"), ensure_ascii=False)

            if reward >= 1: break