Example #1
def action_model(model_type, model, board, player):
    """
    :param model_type: one of "policy_dl", "policy_rl", "policy_rollout" or "value_net"
    :param model: the loaded policy or value model
    :param board: a numpy array with size (15 x 15)
    :param player: "black" or "white"
    :return: predicted action, or None if the model type is unsupported
    """
    if player == "black":
        player = RenjuGame.PLAYER_BLACK
    else:
        player = RenjuGame.PLAYER_WHITE
    position = RenjuGame(board=board, player=player)
    if model_type == "policy_dl" or model_type == "policy_rl":
        state = position.get_states()
        action = model.predict([state])[0]
    elif model_type == "policy_rollout":
        # state = position.get_patterns()
        state = position.get_states(flatten=True)
        action = model.predict([state])[0]
    elif model_type == "value_net":
        state = position.get_states(player_plane=True)
        action = model.predict([state])[0]
    else:
        logger.error("not support model type=%s" % model_type)
        action = None
    return action
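For illustration only: action_model relies on the model exposing a predict() method that takes a batch of states and returns one flattened action index per state. The stub below is a hypothetical stand-in (not part of the project) that makes this contract concrete.

import numpy as np

class StubPolicyModel(object):
    """Hypothetical model: predict() maps a batch of board states to one
    flattened action index per state (here: argmax over the 15 x 15 plane)."""
    def predict(self, states):
        return [int(np.argmax(state)) for state in states]

board = np.zeros((15, 15))
board[7, 7] = 1.0                               # single stone at the centre
action = StubPolicyModel().predict([board])[0]
print("predicted action index: %d" % action)    # 7 * 15 + 7 = 112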
Example #2
 def loss_function(self, optimizer, learn_rate):
     self.tf_var["act"] = tf.placeholder(
         "float", [None, self.board_size * self.board_size])
     self.tf_var["target"] = tf.placeholder("float", [None])
     predict_act = tf.reduce_sum(tf.mul(self.tf_var["out"],
                                        self.tf_var["act"]),
                                 reduction_indices=1)
     self.tf_var["cost"] = tf.reduce_mean(
         tf.square(self.tf_var["target"] - predict_act))
     if optimizer == "sgd":
         self.tf_var["optimizer"] = \
             tf.train.GradientDescentOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     elif optimizer == "adam":
         self.tf_var["optimizer"] = \
             tf.train.AdamOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     elif optimizer == "rmsProb":
         self.tf_var["optimizer"] = \
             tf.train.RMSPropOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     else:
         logger.error("not found optimizer=%s" % optimizer, to_exit=True)
     # evaluate
     correct_pred = tf.equal(tf.argmax(self.tf_var["out"], 1),
                             tf.argmax(self.tf_var["act"], 1))
     self.tf_var["accuracy"] = tf.reduce_mean(
         tf.cast(correct_pred, tf.float32))
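The tf.mul / tf.reduce_sum pair above is just a one-hot mask: multiplying the network output by the one-hot action vector and summing each row keeps only the predicted value of the action actually played, which is then regressed onto the target. A NumPy sketch of the same arithmetic with made-up numbers:

import numpy as np

out = np.array([[0.1, 0.7, 0.2, 0.0],        # network outputs, batch of 2
                [0.3, 0.1, 0.4, 0.2]])
act = np.array([[0.0, 1.0, 0.0, 0.0],        # one-hot vectors of the played actions
                [0.0, 0.0, 1.0, 0.0]])
target = np.array([1.0, -1.0])               # observed returns

predict_act = np.sum(out * act, axis=1)      # -> [0.7, 0.4]
cost = np.mean(np.square(target - predict_act))
print("cost = %.4f" % cost)                  # (0.3**2 + 1.4**2) / 2 = 1.025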
Example #3
 def loss_function(self, optimizer, learn_rate, batch_size):
     self.tf_var["target"] = tf.placeholder("float", [None])
     self.tf_var["cost"] = tf.reduce_sum(tf.pow(self.tf_var["out"] - self.tf_var["target"], 2) / (2 * batch_size))
     if optimizer == "sgd":
         self.tf_var["optimizer"] = tf.train.GradientDescentOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     elif optimizer == "adam":
         self.tf_var["optimizer"] = tf.train.AdamOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     elif optimizer == "rmsProb":
         self.tf_var["optimizer"] = tf.train.RMSPropOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     else:
         logger.error("not found optimizer=%s" % optimizer, to_exit=True)
Example #4
 def train_value_network(self, rpc, sample_num=1000, max_time_steps=225,
                         epochs=20, batch_size=32):
     """
     :param rpc: ModelRPC handle used to sample games for the value network
     :param sample_num: number of games to sample (<= 0 reuses the saved sample file)
     :param max_time_steps: maximum number of moves per sampled game
     :param epochs: number of training epochs to run
     :param batch_size: mini-batch size
     :return:
     """
     model_params = self.param_unserierlize(init_params={"global_step": 0, "global_epoch": 0})
     if sample_num > 0:  # create sample
         start_time = time.time()
         sample_file = "data/value_net_phase_%d_samples_%d.pkl" % (self.phase, sample_num)
         sample_games = sampling_for_value_network(rpc, sample_num, sample_file, max_time_steps=max_time_steps)
         elapsed_time = int((time.time() - start_time) * 1000)
         logger.info("sampling for value network, samples=%d, time=%d(ms)" % (sample_num, elapsed_time))
         cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
         logger.info("save sample file: %s" % sample_file)
         model_params["sample_file"] = sample_file
         self.param_serierlize(model_params)
     else:  # load old sample
         if 'sample_file' not in model_params:
             logger.error("not found sample file", to_exit=True)
         sample_games = cPickle.load(open(model_params["sample_file"], 'rb'))
     epoch_step, train_step = model_params["global_epoch"], model_params["global_step"]
     while epoch_step < (model_params["global_epoch"] + epochs):
         start_time = time.time()
         epoch_step += 1
         random.shuffle(sample_games)
         avg_loss = 0.0
         for idx in xrange(0, len(sample_games), batch_size):
             end_idx = min(len(sample_games), idx + batch_size)
             mini_samples = sample_games[idx: end_idx]
             # transform sample data
             mini_states = [sampled_game.get_states(player_plane=True) for sampled_game, _ in mini_samples]
             mini_rewards = [sampled_reward for _, sampled_reward in mini_samples]
             fetch_status = self.fit(mini_states, mini_rewards, fetch_info=True)
             _, train_step, loss = fetch_status
             avg_loss += loss
             train_step = int(train_step)
             if train_step % 20 == 0:
                 elapsed_time = int((time.time() - start_time) * 1000)
                 logger.info(
                     "train value network, phase=%d, epoch=%d, step=%d, loss=%.7f, time=%d(ms)" %
                     (self.phase, epoch_step, train_step, loss, elapsed_time))
                 start_time = time.time()
         avg_loss /= math.ceil(len(sample_games) / batch_size)
         logger.info("train value network, phase=%d, epoch=%d, avg_loss=%.6f" % (self.phase, epoch_step, avg_loss))
         if epoch_step % 5 == 0:  # save model
             model_params["global_step"] = train_step
             model_params["global_epoch"] = epoch_step
             self.param_serierlize(model_params)
             model_file = self.save_model("value_net_phase_%d" % self.phase, global_step=model_params["global_step"])
             logger.info("save value network model, file=%s" % model_file)
Example #5
def load_model(args, model_type, model_file=None):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    pattern_features = args.pattern_features
    if model_type == "policy_dl":
        model = PolicyDLNetwork(policy_planes, corpus, args, filters=args.policy_dl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_dl_models_dir, device="gpu", gpu=args.policy_dl_gpu,
                                optimizer=args.policy_dl_optimizer,
                                learn_rate=args.policy_dl_learn_rate,
                                distributed_train=False,
                                )
    elif model_type == "policy_rollout":
        model = PolicyRolloutModel(policy_planes, patterns, args,
                                   board_size=args.board_size,
                                   model_dir=args.policy_rollout_models_dir, device="cpu",
                                   optimizer=args.policy_rollout_optimizer,
                                   learn_rate=args.policy_rollout_learn_rate,
                                   distributed_train=False,
                                   )
    elif model_type == "policy_rl":
        model = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase, filters=args.policy_rl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_rl_models_dir, device="cpu",
                                optimizer=args.policy_rl_optimizer, learn_rate=args.policy_rl_learn_rate,
                                distributed_train=False,
                                )
    elif model_type == "value_net":
        model = ValueNetwork(value_planes, args, phase=args.values_net_phase, filters=args.values_net_filters,
                             board_size=args.board_size,
                             model_dir=args.values_net_models_dir, device="cpu",
                             optimizer=args.values_net_optimizer, learn_rate=args.values_net_learn_rate,
                             )
    else:
        logger.error("unsupported model type=%s" % model_type, to_exit=True)
    # init session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                               allow_soft_placement=True,
                                               gpu_options=gpu_options))
    session.run(tf.initialize_all_variables())
    model.set_session(session)
    # restore model
    status = model.restore_model(model_file=model_file)
    if not status and model_type == "policy_rl":
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        model_file = checkpoint.model_checkpoint_path
        logger.info("successful load model file: %s" % model_file)
        model.saver.restore(session, model_file)
    return model
Example #6
def train_policy_network_rl(args):
    policy_planes = args.policy_planes
    # rpc of value_net
    rpc = ModelRPC(args)
    if args.policy_rl_reset:
        # empty old rl policy network
        if os.path.exists(args.policy_rl_models_dir):
            # os.removedirs(args.policy_rl_models_dir)
            shutil.rmtree(args.policy_rl_models_dir)
        os.makedirs(args.policy_rl_models_dir)
        # read parameters from DL policy network
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            model_file = checkpoint.model_checkpoint_path
        else:
            logger.error("not found policy dl model avaliable", to_exit=True)
    else:
        model_file = None
    # init policy RL network
    policy_rl = PolicyRLNetwork(
        policy_planes,
        args,
        phase=args.policy_rl_phase,
        filters=args.policy_rl_filters,
        board_size=args.board_size,
        model_dir=args.policy_rl_models_dir,
        gpu=args.policy_rl_gpu,
        optimizer=args.policy_rl_optimizer,
        learn_rate=args.policy_rl_learn_rate,
        distributed_train=False,
    )
    # init session
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                               allow_soft_placement=True))
    session.run(tf.initialize_all_variables())
    policy_rl.set_session(session)
    # restore model if exist
    if model_file is not None:
        policy_rl.saver.restore(session, model_file)
        logger.info("load model file: %s" % model_file)
        policy_rl.save_model("policy_rl", global_step=0)
    else:
        policy_rl.restore_model()
    # train policy rl
    policy_rl.train_policy_network(rpc,
                                   batch_games=args.policy_rl_batch_games,
                                   save_step=args.policy_rl_save_step)
Example #7
def play_games(args):
    player = args.player
    board_stream = args.board
    if board_stream != "":
        if not is_legal_stream(board_stream):
            logger.error("not legal board stream:[%s]" % board_stream, to_exit=True)
        board = stream_to_board(board_stream)
    else:
        board = None
    root = RenjuGame(board=board, player=player)
    rpc = ModelRPC(args)
    mcst = MCTS(rpc, visit_threshold=args.mcts_visit_threshold, virtual_loss=args.mcts_virtual_loss,
                explore_rate=args.mcts_explore_rate, mix_lambda=args.mcts_mix_lambda)
    root = mcst.simulation(root)
    node, action = mcst.decision(root)
    print board
    print "action: %d", action
Example #8
 def loss_function(self, optimizer, learn_rate, batch_size):
     self.tf_var["target"] = tf.placeholder("float", [None])
     self.tf_var["cost"] = tf.reduce_sum(
         tf.pow(self.tf_var["out"] - self.tf_var["target"], 2) /
         (2 * batch_size))
     if optimizer == "sgd":
         self.tf_var["optimizer"] = tf.train.GradientDescentOptimizer(
             learn_rate).minimize(self.tf_var["cost"],
                                  global_step=self.global_step)
     elif optimizer == "adam":
         self.tf_var["optimizer"] = tf.train.AdamOptimizer(
             learn_rate).minimize(self.tf_var["cost"],
                                  global_step=self.global_step)
     elif optimizer == "rmsProb":
         self.tf_var["optimizer"] = tf.train.RMSPropOptimizer(
             learn_rate).minimize(self.tf_var["cost"],
                                  global_step=self.global_step)
     else:
         logger.error("not found optimizer=%s" % optimizer, to_exit=True)
Example #9
def play_games(args):
    player = args.player
    board_stream = args.board
    if board_stream != "":
        if not is_legal_stream(board_stream):
            logger.error("not legal board stream:[%s]" % board_stream,
                         to_exit=True)
        board = stream_to_board(board_stream)
    else:
        board = None
    root = RenjuGame(board=board, player=player)
    rpc = ModelRPC(args)
    mcst = MCTS(rpc,
                visit_threshold=args.mcts_visit_threshold,
                virtual_loss=args.mcts_virtual_loss,
                explore_rate=args.mcts_explore_rate,
                mix_lambda=args.mcts_mix_lambda)
    root = mcst.simulation(root)
    node, action = mcst.decision(root)
    print board
    print "action: %d", action
Example #10
 def loss_function(self, optimizer, learn_rate):
     # loss model
     self.tf_var["target"] = tf.placeholder("float", [None, self.board_size * self.board_size])
     self.tf_var["cost"] = tf.reduce_mean(
         tf.nn.softmax_cross_entropy_with_logits(self.tf_var["out"], self.tf_var["target"])
     )
     # optimizer
     if optimizer == "sgd":
         self.tf_var["optimizer"] = \
             tf.train.GradientDescentOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     elif optimizer == "adam":
         self.tf_var["optimizer"] = \
             tf.train.AdamOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     elif optimizer == "rmsProb":
         self.tf_var["optimizer"] = \
             tf.train.RMSPropOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     else:
         logger.error("not found optimizer=%s" % optimizer, to_exit=True)
     # evaluate
     correct_pred = tf.equal(tf.argmax(self.tf_var["out"], 1), tf.argmax(self.tf_var["target"], 1))
     self.tf_var["accuracy"] = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
Example #11
def train_policy_network_rl(args):
    policy_planes = args.policy_planes
    # rpc of value_net
    rpc = ModelRPC(args)
    if args.policy_rl_reset:
        # empty old rl policy network
        if os.path.exists(args.policy_rl_models_dir):
            # os.removedirs(args.policy_rl_models_dir)
            shutil.rmtree(args.policy_rl_models_dir)
        os.makedirs(args.policy_rl_models_dir)
        # read parameters from DL policy network
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            model_file = checkpoint.model_checkpoint_path
        else:
            logger.error("not found policy dl model avaliable", to_exit=True)
    else:
        model_file = None
    # init policy RL network
    policy_rl = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase, filters=args.policy_rl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_rl_models_dir, gpu=args.policy_rl_gpu,
                                optimizer=args.policy_rl_optimizer, learn_rate=args.policy_rl_learn_rate,
                                distributed_train=False,
                                )
    # init session
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False, allow_soft_placement=True))
    session.run(tf.initialize_all_variables())
    policy_rl.set_session(session)
    # restore model if exist
    if model_file is not None:
        policy_rl.saver.restore(session, model_file)
        logger.info("load model file: %s" % model_file)
        policy_rl.save_model("policy_rl", global_step=0)
    else:
        policy_rl.restore_model()
    # train policy rl
    policy_rl.train_policy_network(rpc,
                                   batch_games=args.policy_rl_batch_games,
                                   save_step=args.policy_rl_save_step)
Example #12
 def loss_function(self, optimizer, learn_rate):
     # loss model
     self.tf_var["target"] = tf.placeholder(
         "float", [None, self.board_size * self.board_size])
     self.tf_var["cost"] = tf.reduce_mean(
         tf.nn.softmax_cross_entropy_with_logits(self.tf_var["out"],
                                                 self.tf_var["target"]))
     # optimizer
     if optimizer == "sgd":
         self.tf_var["optimizer"] = \
             tf.train.GradientDescentOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     elif optimizer == "adam":
         self.tf_var["optimizer"] = \
             tf.train.AdamOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     elif optimizer == "rmsProb":
         self.tf_var["optimizer"] = \
             tf.train.RMSPropOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step)
     else:
         logger.error("not found optimizer=%s" % optimizer, to_exit=True)
     # evaluate
     correct_pred = tf.equal(tf.argmax(self.tf_var["out"], 1),
                             tf.argmax(self.tf_var["target"], 1))
     self.tf_var["accuracy"] = tf.reduce_mean(
         tf.cast(correct_pred, tf.float32))
Example #13
 def import_RenjuNet(self, file_path):
     if not os.path.exists(file_path):
         logger.error("not found file: %s" % file_path, to_exit=True)
     # read xml file
     bs_tree = BeautifulSoup(open(file_path, 'r').read())
     games = bs_tree.find_all("game")
     # insert moves
     game_num = len(games)
     move_count = 0
     step = 0
     for game in games:
         step += 1
         gid = int(game.attrs["id"])
         moves = game.move.text.strip().replace("%20", " ").split(" ")
         if len(self.db.query("select id from renju WHERE gid=?", gid)) > 0:  # when gid exists
             continue
         renju_game = RenjuGame()
         for mid, move in enumerate(moves):
             move = move.strip()
             if move == "":
                 continue
             board_stream = board_to_stream(renju_game.board)
             player = renju_game.player
             row = ord(move[0]) - ord('a')
             col = int(move[1:]) - 1
             action = renju_game.transform_action((row, col))
             # insert
             self.db.execute("insert INTO renju (gid, mid, board, player, action) VALUES (?, ?, ?, ?, ?)",
                             gid, mid, board_stream, player, action)
             # do move
             renju_game.do_move((row, col))
         move_count += len(moves)
         if step % 100 == 0:
             print "load games= %d / %d" % (step, game_num)
     logger.info("newly insert games=%d, moves=%d" % (game_num, move_count))
     print "finish import moves"
Example #14
def train_policy_network_rl_distribute(args):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    # hosts
    ps_hosts = args.ps_hosts.split(",")
    worker_hosts = args.worker_hosts.split(",")
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=args.job_name,
                             task_index=args.task_index)
    if args.job_name == "ps":
        server.join()
    elif args.job_name == "worker":
        if args.policy_rl_reset:
            # empty old rl policy network
            if os.path.exists(args.policy_rl_models_dir):
                # os.removedirs(args.policy_rl_models_dir)
                shutil.rmtree(args.policy_rl_models_dir)
            os.makedirs(args.policy_rl_models_dir)
            # read parameters from DL policy network
            checkpoint = tf.train.get_checkpoint_state(
                args.policy_dl_models_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                model_file = checkpoint.model_checkpoint_path
            else:
                logger.error("not found policy dl model avaliable",
                             to_exit=True)
        else:
            model_file = None
        # init policy RL network
        policy_rl = PolicyRLNetwork(
            policy_planes,
            args,
            phase=args.policy_rl_phase,
            filters=args.policy_rl_filters,
            board_size=args.board_size,
            model_dir=args.policy_rl_models_dir,
            gpu=args.policy_rl_gpu,
            optimizer=args.policy_rl_optimizer,
            learn_rate=args.policy_rl_learn_rate,
            distributed_train=True,
        )
        init_op = tf.initialize_all_variables()
        summary_op = tf.merge_all_summaries()

        sv = tf.train.Supervisor(is_chief=(args.task_index == 0),
                                 logdir=policy_rl.model_dir,
                                 init_op=init_op,
                                 summary_op=summary_op,
                                 saver=policy_rl.saver,
                                 global_step=policy_rl.global_step,
                                 save_model_secs=0)
        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=tf.ConfigProto(
                                                  allow_soft_placement=True,
                                                  log_device_placement=True))
        sess.run(init_op)
        # Start queue runners for the input pipelines (if any).
        sv.start_queue_runners(sess)
        policy_rl.set_session(sess)
        if model_file is not None:
            policy_rl.saver.restore(sess, model_file)
            logger.info("load model file: %s" % model_file)
        else:
            policy_rl.restore_model()
        # load value network
        if args.policy_rl_phase > 1:
            value_dl = ValueNetwork(
                value_planes,
                phase=args.values_net_phase,
                filters=args.values_net_filters,
                board_size=args.board_size,
                model_dir=args.values_net_models_dir,
                gpu=args.values_net_gpu,
                optimizer=args.values_net_optimizer,
                learn_rate=args.values_net_learn_rate,
            )
        else:
            value_dl = None
        # train policy rl
        policy_rl.train_policy_network(value_dl,
                                       epochs=args.policy_rl_epochs,
                                       batch_games=args.policy_rl_batch_games,
                                       save_step=args.policy_rl_save_step)
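For reference, the distributed setup above is driven by two comma-separated host lists (the args.ps_hosts / args.worker_hosts strings). A minimal sketch of how the dictionary passed to tf.train.ClusterSpec is assembled; the addresses are placeholders:

ps_hosts = "192.168.0.1:2222".split(",")
worker_hosts = "192.168.0.2:2222,192.168.0.3:2222".split(",")
cluster_def = {"ps": ps_hosts, "worker": worker_hosts}
print(cluster_def)
# {'ps': ['192.168.0.1:2222'],
#  'worker': ['192.168.0.2:2222', '192.168.0.3:2222']}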
Example #15
def train_policy_network_rl_distribute(args):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    # hosts
    ps_hosts = args.ps_hosts.split(",")
    worker_hosts = args.worker_hosts.split(",")
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # Create and start a server for the local task.
    server = tf.train.Server(cluster,
                             job_name=args.job_name,
                             task_index=args.task_index)
    if args.job_name == "ps":
        server.join()
    elif args.job_name == "worker":
        if args.policy_rl_reset:
            # empty old rl policy network
            if os.path.exists(args.policy_rl_models_dir):
                # os.removedirs(args.policy_rl_models_dir)
                shutil.rmtree(args.policy_rl_models_dir)
            os.makedirs(args.policy_rl_models_dir)
            # read parameters from DL policy network
            checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                model_file = checkpoint.model_checkpoint_path
            else:
                logger.error("not found policy dl model avaliable", to_exit=True)
        else:
            model_file = None
        # init policy RL network
        policy_rl = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase, filters=args.policy_rl_filters,
                                    board_size=args.board_size,
                                    model_dir=args.policy_rl_models_dir, gpu=args.policy_rl_gpu,
                                    optimizer=args.policy_rl_optimizer, learn_rate=args.policy_rl_learn_rate,
                                    distributed_train=True,
                                    )
        init_op = tf.initialize_all_variables()
        summary_op = tf.merge_all_summaries()

        sv = tf.train.Supervisor(is_chief=(args.task_index == 0),
                                 logdir=policy_rl.model_dir,
                                 init_op=init_op,
                                 summary_op=summary_op,
                                 saver=policy_rl.saver,
                                 global_step=policy_rl.global_step,
                                 save_model_secs=0)
        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=tf.ConfigProto(allow_soft_placement=True,
                                                                    log_device_placement=True)
                                              )
        sess.run(init_op)
        # Start queue runners for the input pipelines (if any).
        sv.start_queue_runners(sess)
        policy_rl.set_session(sess)
        if model_file is not None:
            policy_rl.saver.restore(sess, model_file)
            logger.info("load model file: %s" % model_file)
        else:
            policy_rl.restore_model()
        # load value network
        if args.policy_rl_phase > 1:
            value_dl = ValueNetwork(value_planes, phase=args.values_net_phase, filters=args.values_net_filters,
                                    board_size=args.board_size,
                                    model_dir=args.values_net_models_dir, gpu=args.values_net_gpu,
                                    optimizer=args.values_net_optimizer, learn_rate=args.values_net_learn_rate,
                                    )
        else:
            value_dl = None
        # train policy rl
        policy_rl.train_policy_network(value_dl, epochs=args.policy_rl_epochs,
                                       batch_games=args.policy_rl_batch_games,
                                       save_step=args.policy_rl_save_step)
Example #16
def load_model(args, model_type, model_file=None):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    pattern_features = args.pattern_features
    if model_type == "policy_dl":
        model = PolicyDLNetwork(
            policy_planes,
            corpus,
            args,
            filters=args.policy_dl_filters,
            board_size=args.board_size,
            model_dir=args.policy_dl_models_dir,
            device="gpu",
            gpu=args.policy_dl_gpu,
            optimizer=args.policy_dl_optimizer,
            learn_rate=args.policy_dl_learn_rate,
            distributed_train=False,
        )
    elif model_type == "policy_rollout":
        model = PolicyRolloutModel(
            policy_planes,
            patterns,
            args,
            board_size=args.board_size,
            model_dir=args.policy_rollout_models_dir,
            device="cpu",
            optimizer=args.policy_rollout_optimizer,
            learn_rate=args.policy_rollout_learn_rate,
            distributed_train=False,
        )
    elif model_type == "policy_rl":
        model = PolicyRLNetwork(
            policy_planes,
            args,
            phase=args.policy_rl_phase,
            filters=args.policy_rl_filters,
            board_size=args.board_size,
            model_dir=args.policy_rl_models_dir,
            device="cpu",
            optimizer=args.policy_rl_optimizer,
            learn_rate=args.policy_rl_learn_rate,
            distributed_train=False,
        )
    elif model_type == "value_net":
        model = ValueNetwork(
            value_planes,
            args,
            phase=args.values_net_phase,
            filters=args.values_net_filters,
            board_size=args.board_size,
            model_dir=args.values_net_models_dir,
            device="cpu",
            optimizer=args.values_net_optimizer,
            learn_rate=args.values_net_learn_rate,
        )
    else:
        logger.error("unsupported model type=%s" % model_type, to_exit=True)
    # init session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                               allow_soft_placement=True,
                                               gpu_options=gpu_options))
    session.run(tf.initialize_all_variables())
    model.set_session(session)
    # restore model
    status = model.restore_model(model_file=model_file)
    if not status and model_type == "policy_rl":
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        model_file = checkpoint.model_checkpoint_path
        logger.info("successful load model file: %s" % model_file)
        model.saver.restore(session, model_file)
    return model
Example #17
 def train_value_network(self,
                         rpc,
                         sample_num=1000,
                         max_time_steps=225,
                         epochs=20,
                         batch_size=32):
     """
     :param rpc: ModelRPC handle used to sample games for the value network
     :param sample_num: number of games to sample (<= 0 reuses the saved sample file)
     :param max_time_steps: maximum number of moves per sampled game
     :param epochs: number of training epochs to run
     :param batch_size: mini-batch size
     :return:
     """
     model_params = self.param_unserierlize(init_params={
         "global_step": 0,
         "global_epoch": 0
     })
     if sample_num > 0:  # create sample
         start_time = time.time()
         sample_file = "data/value_net_phase_%d_samples_%d.pkl" % (
             self.phase, sample_num)
         sample_games = sampling_for_value_network(
             rpc, sample_num, sample_file, max_time_steps=max_time_steps)
         elapsed_time = int((time.time() - start_time) * 1000)
         logger.info("sampling for value network, samples=%d, time=%d(ms)" %
                     (sample_num, elapsed_time))
         cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
         logger.info("save sample file: %s" % sample_file)
         model_params["sample_file"] = sample_file
         self.param_serierlize(model_params)
     else:  # load old sample
         if 'sample_file' not in model_params:
             logger.error("not found sample file", to_exit=True)
         sample_games = cPickle.load(open(model_params["sample_file"],
                                          'rb'))
     epoch_step, train_step = model_params["global_epoch"], model_params[
         "global_step"]
     while epoch_step < (model_params["global_epoch"] + epochs):
         start_time = time.time()
         epoch_step += 1
         random.shuffle(sample_games)
         avg_loss = 0.0
         for idx in xrange(0, len(sample_games), batch_size):
             end_idx = min(len(sample_games), idx + batch_size)
             mini_samples = sample_games[idx:end_idx]
             # transform sample data
             mini_states = [
                 sampled_game.get_states(player_plane=True)
                 for sampled_game, _ in mini_samples
             ]
             mini_rewards = [
                 sampled_reward for _, sampled_reward in mini_samples
             ]
             fetch_status = self.fit(mini_states,
                                     mini_rewards,
                                     fetch_info=True)
             _, train_step, loss = fetch_status
             avg_loss += loss
             train_step = int(train_step)
             if train_step % 20 == 0:
                 elapsed_time = int((time.time() - start_time) * 1000)
                 logger.info(
                     "train value network, phase=%d, epoch=%d, step=%d, loss=%.7f, time=%d(ms)"
                     % (self.phase, epoch_step, train_step, loss,
                        elapsed_time))
                 start_time = time.time()
         avg_loss /= math.ceil(len(sample_games) / batch_size)
         logger.info(
             "train value network, phase=%d, epoch=%d, avg_loss=%.6f" %
             (self.phase, epoch_step, avg_loss))
         if epoch_step % 5 == 0:  # save model
             model_params["global_step"] = train_step
             model_params["global_epoch"] = epoch_step
             self.param_serierlize(model_params)
             model_file = self.save_model(
                 "value_net_phase_%d" % self.phase,
                 global_step=model_params["global_step"])
             logger.info("save value network model, file=%s" % model_file)