def action_model(model_type, model, board, player):
    """
    :param model_type: one of "policy_dl", "policy_rl", "policy_rollout", "value_net"
    :param model: a policy or value model matching model_type
    :param board: a numpy array with size (15 x 15)
    :param player: "black" or "white"
    :return: the model prediction for the given position, or None for an unsupported model_type
    """
    if player == "black":
        player = RenjuGame.PLAYER_BLACK
    else:
        player = RenjuGame.PLAYER_WHITE
    position = RenjuGame(board=board, player=player)
    if model_type == "policy_dl" or model_type == "policy_rl":
        state = position.get_states()
        action = model.predict([state])[0]
    elif model_type == "policy_rollout":
        # state = position.get_patterns()
        state = position.get_states(flatten=True)
        action = model.predict([state])[0]
    elif model_type == "value_net":
        state = position.get_states(player_plane=True)
        action = model.predict([state])[0]
    else:
        logger.error("unsupported model type=%s" % model_type)
        action = None
    return action
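# Minimal usage sketch for action_model(), wrapped in a helper so nothing runs
# on import. It assumes a trained model restored via load_model() below and an
# `args` namespace as parsed elsewhere in this project; the empty board is
# illustrative only.
def _example_action_model(args):
    import numpy as np
    board = np.zeros((15, 15), dtype=np.int8)  # empty 15x15 board
    model = load_model(args, "policy_dl")      # restore the SL policy network
    # prediction over the 225 board points for black to move
    probs = action_model("policy_dl", model, board, "black")
    return probs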
def loss_function(self, optimizer, learn_rate): self.tf_var["act"] = tf.placeholder( "float", [None, self.board_size * self.board_size]) self.tf_var["target"] = tf.placeholder("float", [None]) predict_act = tf.reduce_sum(tf.mul(self.tf_var["out"], self.tf_var["act"]), reduction_indices=1) self.tf_var["cost"] = tf.reduce_mean( tf.square(self.tf_var["target"] - predict_act)) if optimizer == "sgd": self.tf_var["optimizer"] = \ tf.train.GradientDescentOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step) elif optimizer == "adam": self.tf_var["optimizer"] = \ tf.train.AdamOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step) elif optimizer == "rmsProb": self.tf_var["optimizer"] = \ tf.train.RMSPropOptimizer(learn_rate).minimize(self.tf_var["cost"], global_step=self.global_step) else: logger.error("not found optimizer=%s" % optimizer, to_exit=True) # evaluate correct_pred = tf.equal(tf.argmax(self.tf_var["out"], 1), tf.argmax(self.tf_var["act"], 1)) self.tf_var["accuracy"] = tf.reduce_mean( tf.cast(correct_pred, tf.float32))
def loss_function(self, optimizer, learn_rate, batch_size):
    self.tf_var["target"] = tf.placeholder("float", [None])
    self.tf_var["cost"] = tf.reduce_sum(
        tf.pow(self.tf_var["out"] - self.tf_var["target"], 2) / (2 * batch_size))
    if optimizer == "sgd":
        self.tf_var["optimizer"] = tf.train.GradientDescentOptimizer(learn_rate).minimize(
            self.tf_var["cost"], global_step=self.global_step)
    elif optimizer == "adam":
        self.tf_var["optimizer"] = tf.train.AdamOptimizer(learn_rate).minimize(
            self.tf_var["cost"], global_step=self.global_step)
    elif optimizer == "rmsProb":
        self.tf_var["optimizer"] = tf.train.RMSPropOptimizer(learn_rate).minimize(
            self.tf_var["cost"], global_step=self.global_step)
    else:
        logger.error("not found optimizer=%s" % optimizer, to_exit=True)
def train_value_network(self, rpc, sample_num=1000, max_time_steps=225, epochs=20, batch_size=32):
    """
    :param rpc: RPC handle to the policy networks used for sampling
    :param sample_num: number of games to sample; if 0, reuse the last sample file
    :param max_time_steps: maximum moves per sampled game
    :param epochs: number of additional training epochs over the sample set
    :param batch_size: mini-batch size
    :return:
    """
    model_params = self.param_unserierlize(init_params={"global_step": 0, "global_epoch": 0})
    if sample_num > 0:  # create sample
        start_time = time.time()
        sample_file = "data/value_net_phase_%d_samples_%d.pkl" % (self.phase, sample_num)
        sample_games = sampling_for_value_network(rpc, sample_num, sample_file,
                                                  max_time_steps=max_time_steps)
        elapsed_time = int((time.time() - start_time) * 1000)
        logger.info("sampling for value network, samples=%d, time=%d(ms)" % (sample_num, elapsed_time))
        cPickle.dump(sample_games, open(sample_file, "wb"), protocol=2)
        logger.info("save sample file: %s" % sample_file)
        model_params["sample_file"] = sample_file
        self.param_serierlize(model_params)
    else:  # load old sample
        if 'sample_file' not in model_params:
            logger.error("not found sample file", to_exit=True)
        sample_games = cPickle.load(open(model_params["sample_file"], 'rb'))
    epoch_step, train_step = model_params["global_epoch"], model_params["global_step"]
    # snapshot the target epoch before training: model_params["global_epoch"] is
    # updated on every save below, so reusing it in the loop bound would never terminate
    max_epoch = model_params["global_epoch"] + epochs
    while epoch_step < max_epoch:
        start_time = time.time()
        epoch_step += 1
        random.shuffle(sample_games)
        avg_loss = 0.0
        for idx in xrange(0, len(sample_games), batch_size):
            end_idx = min(len(sample_games), idx + batch_size)
            mini_samples = sample_games[idx: end_idx]
            # transform sample data
            mini_states = [sampled_game.get_states(player_plane=True)
                           for sampled_game, _ in mini_samples]
            mini_rewards = [sampled_reward for _, sampled_reward in mini_samples]
            fetch_status = self.fit(mini_states, mini_rewards, fetch_info=True)
            _, train_step, loss = fetch_status
            avg_loss += loss
            train_step = int(train_step)
            if train_step % 20 == 0:
                elapsed_time = int((time.time() - start_time) * 1000)
                logger.info("train value network, phase=%d, epoch=%d, step=%d, loss=%.7f, time=%d(ms)" %
                            (self.phase, epoch_step, train_step, loss, elapsed_time))
                start_time = time.time()
        # float division: integer division would miscount (or zero out) the batches
        avg_loss /= math.ceil(len(sample_games) / float(batch_size))
        logger.info("train value network, phase=%d, epoch=%d, avg_loss=%.6f" %
                    (self.phase, epoch_step, avg_loss))
        if epoch_step % 5 == 0:  # save model
            model_params["global_step"] = train_step
            model_params["global_epoch"] = epoch_step
            self.param_serierlize(model_params)
            model_file = self.save_model("value_net_phase_%d" % self.phase,
                                         global_step=model_params["global_step"])
            logger.info("save value network model, file=%s" % model_file)
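# Hedged sketch: the sample file written above is a pickled list of
# (RenjuGame, reward) pairs, so it can be reloaded and inspected offline.
# The default file name below is illustrative.
def _example_inspect_samples(sample_file="data/value_net_phase_1_samples_1000.pkl"):
    import cPickle
    sample_games = cPickle.load(open(sample_file, "rb"))
    game, reward = sample_games[0]
    state = game.get_states(player_plane=True)  # value-net input planes
    return state, reward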
def load_model(args, model_type, model_file=None):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    pattern_features = args.pattern_features
    if model_type == "policy_dl":
        model = PolicyDLNetwork(policy_planes, corpus, args, filters=args.policy_dl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_dl_models_dir,
                                device="gpu", gpu=args.policy_dl_gpu,
                                optimizer=args.policy_dl_optimizer,
                                learn_rate=args.policy_dl_learn_rate,
                                distributed_train=False)
    elif model_type == "policy_rollout":
        model = PolicyRolloutModel(policy_planes, patterns, args,
                                   board_size=args.board_size,
                                   model_dir=args.policy_rollout_models_dir,
                                   device="cpu",
                                   optimizer=args.policy_rollout_optimizer,
                                   learn_rate=args.policy_rollout_learn_rate,
                                   distributed_train=False)
    elif model_type == "policy_rl":
        model = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase,
                                filters=args.policy_rl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_rl_models_dir,
                                device="cpu",
                                optimizer=args.policy_rl_optimizer,
                                learn_rate=args.policy_rl_learn_rate,
                                distributed_train=False)
    elif model_type == "value_net":
        model = ValueNetwork(value_planes, args, phase=args.values_net_phase,
                             filters=args.values_net_filters,
                             board_size=args.board_size,
                             model_dir=args.values_net_models_dir,
                             device="cpu",
                             optimizer=args.values_net_optimizer,
                             learn_rate=args.values_net_learn_rate)
    else:
        logger.error("unsupported model type=%s" % model_type, to_exit=True)
    # init session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                               allow_soft_placement=True,
                                               gpu_options=gpu_options))
    session.run(tf.initialize_all_variables())
    model.set_session(session)
    # restore model; a fresh RL policy network falls back to the latest DL checkpoint
    status = model.restore_model(model_file=model_file)
    if not status and model_type == "policy_rl":
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        model_file = checkpoint.model_checkpoint_path
        model.saver.restore(session, model_file)
        logger.info("successfully loaded model file: %s" % model_file)
    return model
def train_policy_network_rl(args):
    policy_planes = args.policy_planes
    # rpc of value_net
    rpc = ModelRPC(args)
    if args.policy_rl_reset:
        # empty old rl policy network
        if os.path.exists(args.policy_rl_models_dir):
            # os.removedirs(args.policy_rl_models_dir)
            shutil.rmtree(args.policy_rl_models_dir)
        os.makedirs(args.policy_rl_models_dir)
        # read parameters from DL policy network
        checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            model_file = checkpoint.model_checkpoint_path
        else:
            logger.error("no available policy dl model found", to_exit=True)
    else:
        model_file = None
    # init policy RL network
    policy_rl = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase,
                                filters=args.policy_rl_filters,
                                board_size=args.board_size,
                                model_dir=args.policy_rl_models_dir,
                                gpu=args.policy_rl_gpu,
                                optimizer=args.policy_rl_optimizer,
                                learn_rate=args.policy_rl_learn_rate,
                                distributed_train=False)
    # init session
    session = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                               allow_soft_placement=True))
    session.run(tf.initialize_all_variables())
    policy_rl.set_session(session)
    # restore model if it exists
    if model_file is not None:
        policy_rl.saver.restore(session, model_file)
        logger.info("load model file: %s" % model_file)
        policy_rl.save_model("policy_rl", global_step=0)
    else:
        policy_rl.restore_model()
    # train policy rl
    policy_rl.train_policy_network(rpc, batch_games=args.policy_rl_batch_games,
                                   save_step=args.policy_rl_save_step)
def play_games(args):
    player = args.player
    board_stream = args.board
    if board_stream != "":
        if not is_legal_stream(board_stream):
            logger.error("illegal board stream: [%s]" % board_stream, to_exit=True)
        board = stream_to_board(board_stream)
    else:
        board = None
    root = RenjuGame(board=board, player=player)
    rpc = ModelRPC(args)
    mcst = MCTS(rpc, visit_threshold=args.mcts_visit_threshold,
                virtual_loss=args.mcts_virtual_loss,
                explore_rate=args.mcts_explore_rate,
                mix_lambda=args.mcts_mix_lambda)
    root = mcst.simulation(root)
    node, action = mcst.decision(root)
    print board
    print "action: %d" % action
def loss_function(self, optimizer, learn_rate):
    # loss model: softmax cross-entropy between move logits and one-hot targets
    # (TF 0.x positional signature: logits first, then labels)
    self.tf_var["target"] = tf.placeholder("float", [None, self.board_size * self.board_size])
    self.tf_var["cost"] = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(self.tf_var["out"], self.tf_var["target"]))
    # optimizer
    if optimizer == "sgd":
        self.tf_var["optimizer"] = \
            tf.train.GradientDescentOptimizer(learn_rate).minimize(self.tf_var["cost"],
                                                                   global_step=self.global_step)
    elif optimizer == "adam":
        self.tf_var["optimizer"] = \
            tf.train.AdamOptimizer(learn_rate).minimize(self.tf_var["cost"],
                                                        global_step=self.global_step)
    elif optimizer == "rmsProb":
        self.tf_var["optimizer"] = \
            tf.train.RMSPropOptimizer(learn_rate).minimize(self.tf_var["cost"],
                                                           global_step=self.global_step)
    else:
        logger.error("not found optimizer=%s" % optimizer, to_exit=True)
    # evaluate
    correct_pred = tf.equal(tf.argmax(self.tf_var["out"], 1), tf.argmax(self.tf_var["target"], 1))
    self.tf_var["accuracy"] = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
def import_RenjuNet(self, file_path):
    if not os.path.exists(file_path):
        logger.error("not found file: %s" % file_path, to_exit=True)
    # read xml file
    bs_tree = BeautifulSoup(open(file_path, 'r').read())
    games = bs_tree.find_all("game")
    # insert moves
    game_num = len(games)
    move_count = 0
    step = 0
    for game in games:
        step += 1
        gid = int(game.attrs["id"])
        moves = game.move.text.strip().replace("%20", " ").split(" ")
        if len(self.db.query("select id from renju WHERE gid=?", gid)) > 0:
            # skip games already imported
            continue
        renju_game = RenjuGame()
        for mid, move in enumerate(moves):
            move = move.strip()
            if move == "":
                continue
            board_stream = board_to_stream(renju_game.board)
            player = renju_game.player
            row = ord(move[0]) - ord('a')
            col = int(move[1:]) - 1
            action = renju_game.transform_action((row, col))
            # insert
            self.db.execute("insert INTO renju (gid, mid, board, player, action) VALUES (?, ?, ?, ?, ?)",
                            gid, mid, board_stream, player, action)
            # do move
            renju_game.do_move((row, col))
        move_count += len(moves)
        if step % 100 == 0:
            print "load games = %d / %d" % (step, game_num)
    logger.info("processed games=%d, newly inserted moves=%d" % (game_num, move_count))
    print "finish import moves"
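# Hedged sketch of the move encoding parsed above: a RenjuNet move such as
# "h8" is a letter plus a 1-based number, mapped to 0-based board indices
# exactly as in import_RenjuNet(); the sample move is illustrative.
def _example_parse_move(move="h8"):
    row = ord(move[0]) - ord('a')  # 'h' -> 7
    col = int(move[1:]) - 1        # '8' -> 7
    return row, col                # (7, 7): the center of a 15x15 board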
def train_policy_network_rl_distribute(args):
    policy_planes = args.policy_planes
    value_planes = args.value_planes
    # hosts
    ps_hosts = args.ps_hosts.split(",")
    worker_hosts = args.worker_hosts.split(",")
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    # Create and start a server for the local task.
    server = tf.train.Server(cluster, job_name=args.job_name, task_index=args.task_index)
    if args.job_name == "ps":
        server.join()
    elif args.job_name == "worker":
        if args.policy_rl_reset:
            # empty old rl policy network
            if os.path.exists(args.policy_rl_models_dir):
                # os.removedirs(args.policy_rl_models_dir)
                shutil.rmtree(args.policy_rl_models_dir)
            os.makedirs(args.policy_rl_models_dir)
            # read parameters from DL policy network
            checkpoint = tf.train.get_checkpoint_state(args.policy_dl_models_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                model_file = checkpoint.model_checkpoint_path
            else:
                logger.error("no available policy dl model found", to_exit=True)
        else:
            model_file = None
        # init policy RL network
        policy_rl = PolicyRLNetwork(policy_planes, args, phase=args.policy_rl_phase,
                                    filters=args.policy_rl_filters,
                                    board_size=args.board_size,
                                    model_dir=args.policy_rl_models_dir,
                                    gpu=args.policy_rl_gpu,
                                    optimizer=args.policy_rl_optimizer,
                                    learn_rate=args.policy_rl_learn_rate,
                                    distributed_train=True)
        init_op = tf.initialize_all_variables()
        summary_op = tf.merge_all_summaries()
        sv = tf.train.Supervisor(is_chief=(args.task_index == 0),
                                 logdir=policy_rl.model_dir,
                                 init_op=init_op,
                                 summary_op=summary_op,
                                 saver=policy_rl.saver,
                                 global_step=policy_rl.global_step,
                                 save_model_secs=0)
        sess = sv.prepare_or_wait_for_session(server.target,
                                              config=tf.ConfigProto(allow_soft_placement=True,
                                                                    log_device_placement=True))
        sess.run(init_op)
        # Start queue runners for the input pipelines (if any).
        sv.start_queue_runners(sess)
        policy_rl.set_session(sess)
        if model_file is not None:
            policy_rl.saver.restore(sess, model_file)
            logger.info("load model file: %s" % model_file)
        else:
            policy_rl.restore_model()
        # load value network
        if args.policy_rl_phase > 1:
            value_dl = ValueNetwork(value_planes, phase=args.values_net_phase,
                                    filters=args.values_net_filters,
                                    board_size=args.board_size,
                                    model_dir=args.values_net_models_dir,
                                    gpu=args.values_net_gpu,
                                    optimizer=args.values_net_optimizer,
                                    learn_rate=args.values_net_learn_rate)
        else:
            value_dl = None
        # train policy rl
        policy_rl.train_policy_network(value_dl, epochs=args.policy_rl_epochs,
                                       batch_games=args.policy_rl_batch_games,
                                       save_step=args.policy_rl_save_step)
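# Hedged sketch of one plausible way to launch the cluster above: one ps task
# and two worker tasks, each running this script with matching host lists.
# The script name and host:port values are assumptions, not from the original code.
#   python train.py --job_name=ps --task_index=0 \
#       --ps_hosts=10.0.0.1:2222 --worker_hosts=10.0.0.2:2222,10.0.0.3:2222
#   python train.py --job_name=worker --task_index=0  (same host lists)
#   python train.py --job_name=worker --task_index=1  (same host lists)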