class PolicyNetworkBestMovePlayer(GtpInterface):
    def __init__(self, read_file):
        self.policy_network = PolicyNetwork(DEFAULT_FEATURES.planes, use_cpu=True)
        self.read_file = read_file
        super().__init__()

    def clear(self):
        super().clear()
        self.refresh_network()

    def refresh_network(self):
        # Ensure that the player is using the latest version of the network
        # so that the network can be continually trained even as it's playing.
        self.policy_network.initialize_variables(self.read_file)

    def suggest_move(self, position):
        if position.recent and position.n > 100 and position.recent[-1] is None:
            # Pass if the opponent passes
            return None
        move_probabilities = self.policy_network.run(position)
        for move in sorted_moves(move_probabilities):
            if go.is_reasonable(position, move):
                return move
        return None

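# The sorted_moves() helper used above is not part of this snippet. A minimal
# sketch of what it is assumed to do -- order board coordinates by descending
# policy probability -- given an N x N numpy array of move probabilities; the
# real implementation may differ.
def sorted_moves(probability_array):
    # Enumerate all (row, col) coordinates and sort them so that the
    # highest-probability move comes first.
    height, width = probability_array.shape
    coords = [(i, j) for i in range(height) for j in range(width)]
    return sorted(coords, key=lambda c: probability_array[c], reverse=True)
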
def __init__(self, read_file, seconds_per_move=5):
    self.seconds_per_move = seconds_per_move
    self.max_rollout_depth = go.N * go.N * 3
    self.policy_network = PolicyNetwork(DEFAULT_FEATURES.planes, use_cpu=True)
    self.read_file = read_file
    super().__init__()

def make_gtp_instance(strategy_name, read_file):
    n = PolicyNetwork(use_cpu=True)
    n.initialize_variables(read_file)
    if strategy_name == 'random':
        instance = RandomPlayer()
    elif strategy_name == 'policy':
        instance = GreedyPolicyPlayer(n)
    elif strategy_name == 'randompolicy':
        instance = RandomPolicyPlayer(n)
    elif strategy_name == 'mcts':
        instance = MCTSPlayer(n)
    else:
        return None
    gtp_engine = gtp.Engine(instance)
    return gtp_engine

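# A hedged usage sketch for make_gtp_instance(): wiring the returned engine to
# a stdin/stdout command loop, mirroring the gtp() drivers below. The
# checkpoint path is a placeholder, not a file shipped with these snippets.
engine = make_gtp_instance('policy', 'saved_models/latest.ckpt')
if engine is None:
    raise SystemExit('Unknown strategy')
while not engine.disconnect:
    for cmd in input().split('\n'):
        print(engine.send(cmd), end='', flush=True)
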
def gtp(strategy, read_file=None):
    n = PolicyNetwork(use_cpu=True)
    if strategy == 'random':
        instance = RandomPlayer()
    elif strategy == 'policy':
        instance = PolicyNetworkBestMovePlayer(n, read_file)
    elif strategy == 'randompolicy':
        instance = PolicyNetworkRandomMovePlayer(n, read_file)
    elif strategy == 'mcts':
        instance = MCTS(n, read_file)
    else:
        sys.stderr.write("Unknown strategy")
        sys.exit()
    gtp_engine = gtp_lib.Engine(instance)
    sys.stderr.write("GTP engine ready\n")
    sys.stderr.flush()
    while not gtp_engine.disconnect:
        inpt = input()
        # handle either single lines at a time
        # or multiple commands separated by '\n'
        try:
            cmd_list = inpt.split("\n")
        except:
            cmd_list = [inpt]
        for cmd in cmd_list:
            engine_reply = gtp_engine.send(cmd)
            sys.stdout.write(engine_reply)
            sys.stdout.flush()

def gtp(strategy, read_file=None):
    n = PolicyNetwork(use_cpu=True)
    if strategy == 'random':
        instance = RandomPlayer()
    elif strategy == 'policy':
        instance = PolicyNetworkBestMovePlayer(n, read_file)
    elif strategy == 'randompolicy':
        instance = PolicyNetworkRandomMovePlayer(n, read_file)
    elif strategy == 'mcts':
        instance = MCTS(n, read_file)
    else:
        sys.stderr.write("Error: unknown strategy")
        sys.exit()
    gtp_engine = gtp_lib.Engine(instance)
    sys.stderr.write("GTP\n")
    sys.stderr.flush()
    while not gtp_engine.disconnect:
        inpt = input()
        try:
            cmd_list = inpt.split("\n")
        except:
            cmd_list = [inpt]
        for cmd in cmd_list:
            engine_reply = gtp_engine.send(cmd)
            sys.stdout.write(engine_reply)
            sys.stdout.flush()

def initialize(self):
    try:
        n = PolicyNetwork(use_cpu=True)
        instance = PolicyNetworkBestMovePlayer(n, self.moudle_file)
        self.gtp_engine = gtp_lib.Engine(instance)
    except BaseException as e:
        # Chain the original error so the root cause is not lost
        raise Exception('Initialization of policy network failed') from e

def AI(msg):
    global read_file
    # Extract the move information from the message
    x = msg['msg'][2].upper()
    y = string.index(msg['msg'][3])
    if msg['msg'][0] == 'B':
        color = 'W'
    else:
        color = 'B'
    # Initialize the policy network
    n = PolicyNetwork(use_cpu=True)
    instance = PolicyNetworkBestMovePlayer(n, read_file)
    gtp_engine = gtp_lib.Engine(instance)
    AI_cmd = parse_AI_instruction(color)
    # If a game is already in progress, replay its record into the engine
    if os.path.exists(data_file):
        rfile = open(data_file, 'r')
        cmd_list = rfile.readlines()
        for cmd in cmd_list:
            cmd = cmd.strip('\n ')
            if cmd == '':
                continue
            gtp_engine.send(cmd)
        rfile.close()
    # Parse the opponent's move and append it to the record file
    wfile = open(data_file, 'a')
    player_cmd = parse_player_input(msg['msg'][0], x, y)
    wfile.write(player_cmd + '\n')
    gtp_engine.send(player_cmd)
    gtp_reply = gtp_engine.send(AI_cmd)
    gtp_cmd = parse_AI_input(color, gtp_reply)
    wfile.write(gtp_cmd)
    wfile.close()
    response = color + '[' + gtp_reply[2].lower() + string[int(gtp_reply[3:])] + ']'
    return {'game_id': msg['game_id'], 'msg': response}

def AI(msg):
    global read_file
    # Extract the move information from the message
    data_file = data_file_path + msg['game_id']
    x, y, color = parse_input_msg(msg)
    print(x, y, color)
    # Initialize the policy network
    n = PolicyNetwork(use_cpu=True)
    instance = PolicyNetworkBestMovePlayer(n, read_file)
    gtp_engine = gtp_lib.Engine(instance)
    AI_cmd = parse_AI_instruction(color)
    # If a game is already in progress, replay its record into the engine
    if os.path.exists(data_file):
        rfile = open(data_file, 'r')
        cmd_list = rfile.readlines()
        for cmd in cmd_list:
            cmd = cmd.strip('\n ')
            if cmd == '':
                continue
            gtp_engine.send(cmd)
        rfile.close()
    # Parse the opponent's move and append it to the record file
    wfile = open(data_file, 'a')
    if msg['msg'][2].lower() == 't' and msg['msg'][3].lower() == 't':
        pass  # 'tt' encodes a pass move, so there is nothing to forward
    else:
        player_cmd = parse_player_input(msg['msg'][0], x, y)
        wfile.write(player_cmd + '\n')
        gtp_engine.send(player_cmd)
    gtp_reply = gtp_engine.send(AI_cmd)
    gtp_cmd = parse_AI_input(color, gtp_reply)
    wfile.write(gtp_cmd)
    wfile.close()
    AI_x, AI_y = parse_AI_reply(gtp_reply)
    response = color + '[' + AI_x + AI_y + ']'
    return {'game_id': msg['game_id'], 'msg': response}

def gtp(strategy, read_file=None):
    network = PolicyNetwork()
    instance = MCTS(network, read_file)
    gtp_engine = gtp_lib.Engine(instance)
    print('gtp engine ready')
    while not gtp_engine.disconnect:
        inpt = input()
        try:
            cmd_list = inpt.split('\n')
        except:
            cmd_list = [inpt]
        for cmd in cmd_list:
            print('sending cmd %s' % cmd)
            engine_reply = gtp_engine.send(cmd)
            print(engine_reply)

def __init__(self, game_id, mode=0, moudle_file=DEFAULT_AI_MOUDLE_FILE, debug=False):
    '''
    :param game_id: string
    :param mode: mode==0 -> human vs AI; mode==1 -> AI vs AI
    :param moudle_file: string, path to the AI model file
    '''
    # Activate logging of debug information
    self.debug = debug
    # Initialize game state
    self.game_id = game_id
    self.command_list = []
    if not (mode == 1 or mode == 0):
        raise Exception('Invalid Game Mode')
    else:
        self.mode = mode
    self.moudle_file = moudle_file
    try:
        n = PolicyNetwork(use_cpu=True)
        instance = PolicyNetworkBestMovePlayer(n, self.moudle_file)
        self.gtp_engine = gtp_lib.Engine(instance)
    except BaseException as e:
        raise Exception('Initialization of policy network failed') from e
    # TODO: Remove the code below if using a remote database.
    # Game data is stored under 'game_database/data/'; make sure the path exists!
    self.local_data_filepath = 'game_database/data/'
    self.data_file = self.local_data_filepath + self.game_id + '.data'

def train(processed_dir, save_dir, logdir, read_file=None, epochs=50, checkpoint_freq=2):
    test_dataset = DataSet.read(os.path.join(processed_dir, "test.chunk.gz"))
    train_dataset = DataSet.read(os.path.join(processed_dir, "train.chunk.gz"))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    print("=====Network initialization=====")
    net = PolicyNetwork(logdir=logdir, read_file=read_file)
    print("=====Start training...=====")
    for i in range(epochs):
        net.train(train_dataset, test_dataset)
        if i % checkpoint_freq == 0:
            net.save_variables(os.path.join(save_dir, "epoch_" + str(i) + ".ckpt"))

def train(processed_dir, read_file=None, save_file=None, epochs=10, logdir=None, checkpoint_freq=10000):
    test_dataset = DataSet.read(os.path.join(processed_dir, 'test.chunk.gz'))
    train_chunk_files = [
        os.path.join(processed_dir, fname)
        for fname in os.listdir(processed_dir)
        if TRAINING_CHUNK_RE.match(fname)
    ]
    print(train_chunk_files)
    if read_file is not None:
        read_file = os.path.join(os.getcwd(), save_file)
    n = PolicyNetwork()
    # Pass read_file so the resume path computed above is actually used
    n.initialize_variables(read_file)
    if logdir is not None:
        n.initialize_logging(logdir)
    last_save_checkpoint = 0
    for i in range(epochs):
        random.shuffle(train_chunk_files)
        for file in tqdm.tqdm(train_chunk_files, desc='epochs ' + str(i)):
            with timer('load dataset'):
                train_dataset = DataSet.read(file)
            with timer('training'):
                n.train(train_dataset)
            if n.get_global_step() > last_save_checkpoint + checkpoint_freq:
                with timer('save model'):
                    n.save_variables(save_file)
                with timer('test set evaluation'):
                    n.check_accuracy(test_dataset)
                last_save_checkpoint = n.get_global_step()
    with timer('test set evaluation'):
        n.check_accuracy(test_dataset)

class MCTS(GtpInterface):
    def __init__(self, read_file, seconds_per_move=5):
        self.seconds_per_move = seconds_per_move
        self.max_rollout_depth = go.N * go.N * 3
        self.policy_network = PolicyNetwork(DEFAULT_FEATURES.planes, use_cpu=True)
        self.read_file = read_file
        super().__init__()

    def clear(self):
        super().clear()
        self.refresh_network()

    def refresh_network(self):
        # Ensure that the player is using the latest version of the network
        # so that the network can be continually trained even as it's playing.
        self.policy_network.initialize_variables(self.read_file)

    def suggest_move(self, position):
        if position.caps[0] + 50 < position.caps[1]:
            return gtp.RESIGN
        start = time.time()
        move_probs = self.policy_network.run(position)
        root = MCTSNode.root_node(position, move_probs)
        while time.time() - start < self.seconds_per_move:
            self.tree_search(root)
        # There's a theoretical bug here: if you refuse to pass, this AI will
        # eventually start filling in its own eyes.
        return max(root.children.keys(), key=lambda move, root=root: root.children[move].N)

    def tree_search(self, root):
        print("tree search", file=sys.stderr)
        # selection
        chosen_leaf = root.select_leaf()
        # expansion
        position = chosen_leaf.compute_position()
        if position is None:
            print("illegal move!", file=sys.stderr)
            # See go.Position.play_move for notes on detecting legality
            del chosen_leaf.parent.children[chosen_leaf.move]
            return
        print("Investigating following position:\n%s" % (chosen_leaf.position,), file=sys.stderr)
        move_probs = self.policy_network.run(position)
        chosen_leaf.expand(move_probs)
        # evaluation
        value = self.estimate_value(chosen_leaf)
        # backup
        print("value: %s" % value, file=sys.stderr)
        chosen_leaf.backup_value(value)

    def estimate_value(self, chosen_leaf):
        # Estimate value of position using rollout only (for now).
        # (TODO: value network; average the value estimations from rollout + value network)
        leaf_position = chosen_leaf.position
        current = leaf_position
        while current.n < self.max_rollout_depth:
            move_probs = self.policy_network.run(current)
            current = self.play_valid_move(current, move_probs)
            if len(current.recent) > 2 and current.recent[-1] is None and current.recent[-2] is None:
                break
        else:
            print("max rollout depth exceeded!", file=sys.stderr)
        perspective = 1 if leaf_position.player1turn else -1
        return current.score() * perspective

    def play_valid_move(self, position, move_probs):
        for move in sorted_moves(move_probs):
            if go.is_eyeish(position.board, move):
                continue
            candidate_pos = position.play_move(move, mutate=True)
            if candidate_pos is not None:
                return candidate_pos
        return position.pass_move(mutate=True)

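# MCTSNode is referenced above but not included in the snippet. The following
# is a minimal sketch of the interface the MCTS class assumes (root_node,
# select_leaf, compute_position, expand, backup_value, and a visit count N),
# using PUCT-style selection as one plausible choice. It assumes move_probs is
# an iterable of (move, prior) pairs; the real implementation may differ.
import math

class MCTSNode:
    def __init__(self, parent, move, prior):
        self.parent = parent
        self.move = move            # move that leads from parent to this node
        self.prior = prior          # policy network's prior for this move
        self.position = None        # lazily computed by compute_position()
        self.children = {}          # map of move -> MCTSNode
        self.N = 0                  # visit count
        self.Q = 0.0                # running mean value of this subtree

    @staticmethod
    def root_node(position, move_probs):
        root = MCTSNode(None, None, 1.0)
        root.position = position
        root.expand(move_probs)
        return root

    def expand(self, move_probs):
        self.children = {move: MCTSNode(self, move, prior)
                         for move, prior in move_probs}

    def select_leaf(self):
        # Walk down the tree, always taking the highest-scoring child.
        node = self
        while node.children:
            node = max(node.children.values(), key=lambda c: c.action_score())
        return node

    def action_score(self, c_puct=1.0):
        # PUCT: exploit the mean value, explore in proportion to prior/visits.
        return self.Q + c_puct * self.prior * math.sqrt(self.parent.N) / (1 + self.N)

    def compute_position(self):
        # Derive this node's position by playing its move on the parent's
        # position; play_move is assumed to return None for illegal moves.
        self.position = self.parent.position.play_move(self.move)
        return self.position

    def backup_value(self, value):
        # Propagate the rollout value up to the root, flipping the
        # perspective at every ply.
        node = self
        while node is not None:
            node.N += 1
            node.Q += (value - node.Q) / node.N
            value = -value
            node = node.parent
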
def self_play(strategy, read_file=None):
    n = PolicyNetwork(use_cpu=True)
    if strategy == 'random':
        instance = RandomPlayer()
    elif strategy == 'policy':
        instance = PolicyNetworkBestMovePlayer(n, read_file)
    elif strategy == 'randompolicy':
        instance = PolicyNetworkRandomMovePlayer(n, read_file)
    elif strategy == 'mcts':
        instance = MCTS(n, read_file)
    else:
        sys.stderr.write("Unknown strategy")
        sys.exit()
    # instance is the neural-network-backed player
    gtp_engine = gtp_lib.Engine(instance)
    sys.stderr.write("GTP engine ready\n")
    sys.stderr.flush()
    save = ''
    moves_left = 500
    while moves_left > 0:
        # Black and White alternate genmove commands
        if moves_left % 2 == 1:
            inpt = 'genmove b'
        else:
            inpt = 'genmove w'
        try:
            cmd_list = inpt.split("\n")
        except:
            cmd_list = [inpt]
        for cmd in cmd_list:
            engine_reply = gtp_engine.send(cmd)
            sys.stdout.write(engine_reply)
            if engine_reply == '= pass\n\n':
                moves_left = 0
            else:
                # One- or two-digit row coordinate in the GTP reply
                if len(engine_reply) == 7:
                    o1 = engine_reply[3] + engine_reply[4]
                else:
                    o1 = engine_reply[3]
                if moves_left % 2 == 1:
                    save = save + ';B[' + ch.change(engine_reply[2]) + ch.change(o1) + ']'
                else:
                    save = save + ';W[' + ch.change(engine_reply[2]) + ch.change(o1) + ']'
            sys.stdout.flush()
        moves_left = moves_left - 1
    result = instance.position.result()
    sgf = '(;GM[1]\n SZ[19]\nPB[go1]\nPW[go2]\nKM[6.50]\nRE[' + result[0] + ']\n'
    sgf = sgf + save + ')'
    filename = str(time.time())
    save_t.make_folder(filename + '_selfplay')
    save_t.save_txt(filename + '_selfplay', '4', sgf)

def __init__(self, read_file):
    self.policy_network = PolicyNetwork(DEFAULT_FEATURES.planes, use_cpu=True)
    self.read_file = read_file
    super().__init__()

def train(processed_dir="processed_data"): checkpoint_freq = 10000 read_file = None save_file = 'tmp2' epochs = 10 logdir = 'logs2' # test_dataset = DataSet.read(os.path.join(processed_dir, "test.chunk.gz")) train_chunk_files = [ os.path.join(processed_dir, fname) for fname in os.listdir(processed_dir) if TRAINING_CHUNK_RE.match(fname) ] if read_file is not None: read_file = os.path.join(os.getcwd(), save_file) n = PolicyNetwork() n.initialize_variables(read_file) if logdir is not None: n.initialize_logging(logdir) last_save_checkpoint = 0 for i in range(epochs): random.shuffle(train_chunk_files) for file in train_chunk_files: print("提取 %s" % file) with timer("load dataset"): train_dataset = DataSet.read(file) with timer("training"): n.train(train_dataset) with timer("save model"): n.save_variables(save_file) if n.get_global_step() > last_save_checkpoint + checkpoint_freq: with timer("test set evaluation"): n.check_accuracy(test_dataset) last_save_checkpoint = n.get_global_step()
def train(processed_dir, read_file=None, save_file=None, epochs=10, logdir=None, checkpoint_freq=10000):
    test_dataset = DataSet.read(os.path.join(processed_dir, "test.chunk.gz"))
    train_chunk_files = [
        os.path.join(processed_dir, fname)
        for fname in os.listdir(processed_dir)
        if TRAINING_CHUNK_RE.match(fname)
    ]
    if read_file is not None:
        read_file = os.path.join(os.getcwd(), save_file)
    n = PolicyNetwork()
    n.initialize_variables(read_file)
    if logdir is not None:
        n.initialize_logging(logdir)
    last_save_checkpoint = 0
    for i in range(epochs):
        random.shuffle(train_chunk_files)
        for file in train_chunk_files:
            print("Using %s" % file)
            with timer("load dataset"):
                train_dataset = DataSet.read(file)
            with timer("training"):
                n.train(train_dataset)
            with timer("save model"):
                n.save_variables(save_file)
            if n.get_global_step() > last_save_checkpoint + checkpoint_freq:
                with timer("test set evaluation"):
                    n.check_accuracy(test_dataset)
                last_save_checkpoint = n.get_global_step()

def train(processed_dir, read_file=None, save_file=None, epochs=10, logdir=None, checkpoint_freq=10000):
    test_dataset = DataSet.read(os.path.join(processed_dir, "test.chunk.gz"))
    train_chunk_files = [
        os.path.join(processed_dir, fname)
        for fname in os.listdir(processed_dir)
        if TRAINING_CHUNK_RE.match(fname)
    ]
    n = PolicyNetwork(DEFAULT_FEATURES.planes)
    n.initialize_variables(read_file)
    if logdir is not None:
        n.initialize_logging(logdir)
    last_save_checkpoint = 0
    for i in range(epochs):
        random.shuffle(train_chunk_files)
        for file in train_chunk_files:
            print("Using %s" % file)
            train_dataset = DataSet.read(file)
            n.train(train_dataset)
            if save_file is not None and n.get_global_step() > last_save_checkpoint + checkpoint_freq:
                n.check_accuracy(test_dataset)
                print("Saving checkpoint to %s" % save_file, file=sys.stderr)
                last_save_checkpoint = n.get_global_step()
                n.save_variables(save_file)
    if save_file is not None:
        n.save_variables(save_file)
        print("Finished training. New model saved to %s" % save_file, file=sys.stderr)

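# A hedged usage sketch for the train() variant above; every path here is a
# placeholder, not a file that ships with these snippets.
train('processed_data',
      read_file=None,                       # start from scratch
      save_file='savedmodels/policy.ckpt',  # checkpoint target (placeholder)
      epochs=10,
      logdir='logs',
      checkpoint_freq=10000)                # evaluate/save every ~10k steps
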
def AI(msgs, model=DEFAULT_MODEL_PATH, strategy=None):
    global instance, gtp_engine
    print("AI(msg) called, strategy:", strategy)
    lastMsg = msgs[len(msgs) - 1]
    x, y, color = parse_input_msg(lastMsg)
    print('AI(lastMsg) parsed:', x, y, color)
    # Initialize the policy network
    n = PolicyNetwork(use_cpu=True)
    print("PolicyNetwork init.")
    print("n, model:", n, model)
    if strategy == 'random':
        instance = RandomPlayer()
    elif strategy == 'best_move':
        instance = PolicyNetworkBestMovePlayer(n, model)
    elif strategy == 'random_move':
        instance = PolicyNetworkRandomMovePlayer(n, model)
    elif strategy == 'mcts':
        instance = MCTS(n, model)
    print("PolicyNetwork instanced.", instance)
    try:
        gtp_engine = gtp_lib.Engine(instance)
    except Exception:
        print(traceback.format_exc())
    print("GTP Engine ready.")
    AI_cmd = parse_AI_instruction(color)
    print("AI_cmd parsed.")
    # Replay every move received so far so the engine's board is up to date
    for msg in msgs:
        x, y, color = parse_input_msg(msg)
        player_cmd = parse_player_input(color, x, y)
        print("gtp_engine.send(cmd):", player_cmd)
        gtp_engine.send(player_cmd)
    gtp_reply = gtp_engine.send(AI_cmd)
    gtp_cmd = parse_AI_input(color, gtp_reply)
    AI_x, AI_y = parse_AI_reply(gtp_reply)
    response = color + '[' + AI_x + AI_y + ']'
    return {'game_id': lastMsg['game_id'], 'msg': response}

new_vars = []
for name, shape in policy_vars:
    v = tf.contrib.framework.load_variable('model/sl/', name)
    new_vars.append(tf.Variable(v, name=name.replace('PolicNetwork', 'PlayerNetwork')))
saver = tf.train.Saver(new_vars)
sess.run(tf.global_variables_initializer())
saver.save(sess, os.path.join(save_dir, str(t), 'player' + str(t) + '.ckpt'))

g1 = tf.Graph()
with g1.as_default():
    train_net = PolicyNetwork(scope="PolicNetwork")
    train_net.initialize_variables('model/sl/epoch_48.ckpt')
    pos = go.Position()
    train_net.run(pos)

g2 = tf.Graph()
with g2.as_default():
    player_net = PolicyNetwork(scope="PlayerNetwork")
    player_net.initialize_variables('model/rl/2/player2.ckpt')
    pos = go.Position()
    player_net.run(pos)

save_trained_policy(1, 'model/rl')
print("===========load new model=================")

else:
    action_dim = env.action_space.n
if observation_space_is(env, gym.spaces.Box):
    state_dim = env.observation_space.shape[0]
else:
    state_dim = env.observation_space.n

hidden_dim = 256

value_net = ValueNetwork(state_dim, hidden_dim).to(device)
target_value_net = ValueNetwork(state_dim, hidden_dim).to(device)
soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device)
soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device)
policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

# Initialize the target value network as an exact copy of the value network
for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
    target_param.data.copy_(param.data)

value_criterion = nn.MSELoss()
soft_q_criterion1 = nn.MSELoss()
soft_q_criterion2 = nn.MSELoss()

value_lr = 3e-4
soft_q_lr = 3e-4
policy_lr = 3e-4

value_optimizer = optim.Adam(value_net.parameters(), lr=value_lr)
soft_q_optimizer1 = optim.Adam(soft_q_net1.parameters(), lr=soft_q_lr)

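# After the hard copy above, SAC implementations typically keep the target
# value network close to the online one with a soft (Polyak) update after each
# gradient step. A minimal sketch; soft_update and its tau default are
# assumptions, not part of the snippet above (tau is commonly around 5e-3).
def soft_update(target_net, source_net, tau=5e-3):
    # target <- (1 - tau) * target + tau * source
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

# Example: soft_update(target_value_net, value_net) once per update step.
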
def AI(msg):
    print("AI(msg) called.")
    global read_file
    # Extract the move information from the message
    data_file = data_file_path + msg['game_id']
    x, y, color = parse_input_msg(msg)
    print(x, y, color)
    # Initialize the policy network
    n = PolicyNetwork(use_cpu=True)
    print("PolicyNetwork init.")
    print("n, read_file:", n, read_file)
    try:
        instance = PolicyNetworkBestMovePlayer(n, read_file)
    except Exception:
        print(traceback.format_exc())
    print("PolicyNetwork instanced.", instance)
    try:
        global gtp_engine
        gtp_engine = gtp_lib.Engine(instance)
        print("GTP Engine ready.", gtp_engine)
    except Exception:
        print(traceback.format_exc())
    AI_cmd = parse_AI_instruction(color)
    print("AI_cmd parsed.")
    # If a game is already in progress, replay its record into the engine
    try:
        data_file_exist = os.path.exists(data_file)
    except Exception:
        print(traceback.format_exc())
    print("os.path.exists?", data_file_exist)
    if os.path.exists(data_file):
        print("os.path.exists(data_file)!")
        rfile = open(data_file, 'r')
        cmd_list = rfile.readlines()
        for cmd in cmd_list:
            cmd = cmd.strip('\n ')
            if cmd == '':
                continue
            print("gtp_engine.send(cmd):", cmd)
            gtp_engine.send(cmd)
        rfile.close()
    # Parse the opponent's move and append it to the record file
    wfile = open(data_file, 'a')
    if msg['msg'][2].lower() == 't' and msg['msg'][3].lower() == 't':
        pass  # 'tt' encodes a pass move, so there is nothing to forward
    else:
        player_cmd = parse_player_input(msg['msg'][0], x, y)
        wfile.write(player_cmd + '\n')
        gtp_engine.send(player_cmd)
    gtp_reply = gtp_engine.send(AI_cmd)
    gtp_cmd = parse_AI_input(color, gtp_reply)
    wfile.write(gtp_cmd)
    wfile.close()
    AI_x, AI_y = parse_AI_reply(gtp_reply)
    response = color + '[' + AI_x + AI_y + ']'
    return {'game_id': msg['game_id'], 'msg': response}

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from IPython import display
import pylab as pl
import numpy as np
import os
import random
import re
import sys

import go
from policy import PolicyNetwork
from strategies import MCTSPlayerMixin

read_file = "saved_models/20170718"
WHITE, EMPTY, BLACK, FILL, KO, UNKNOWN = range(-1, 5)
n = PolicyNetwork(use_cpu=True)
n.initialize_variables(read_file)
instance = MCTSPlayerMixin(n)


class User():
    def __init__(self, name, state_size, action_size):
        self.name = name
        self.state_size = state_size
        self.action_size = action_size

    def act(self, state, tau):
        action = int(input('Enter your chosen action: '))
        pi = np.zeros(self.action_size)
        pi[action] = 1
        value = None

def train(processed_dir, save_file=None, epochs=10, logdir=None, checkpoint_freq=10000):
    test_dataset = DataSet.read(os.path.join(processed_dir, "test.chunk.gz"))
    train_chunk_files = [
        os.path.join(processed_dir, fname)
        for fname in os.listdir(processed_dir)
        if TRAINING_CHUNK_RE.match(fname)
    ]
    save_file = os.path.join(os.getcwd(), save_file)
    n = PolicyNetwork()
    # Resume from the save file if it exists, otherwise start from scratch
    try:
        n.initialize_variables(save_file)
    except:
        n.initialize_variables(None)
    if logdir is not None:
        n.initialize_logging(logdir)
    last_save_checkpoint = 0
    for i in range(epochs):
        random.shuffle(train_chunk_files)
        for file in train_chunk_files:
            print("Using %s" % file)
            train_dataset = DataSet.read(file)
            train_dataset.shuffle()
            with timer("training"):
                n.train(train_dataset)
            n.save_variables(save_file)
            if n.get_global_step() > last_save_checkpoint + checkpoint_freq:
                with timer("test set evaluation"):
                    n.check_accuracy(test_dataset)
                last_save_checkpoint = n.get_global_step()

def run_iterations(args):
    # Initialize the model with per-environment state/action dimensions
    state_size = 16
    action_size = 4
    if args.env == "MountainCar-v0":
        state_size = 2
        action_size = 3
    if args.env == "Freeway-ram-v0":
        state_size = 128
        action_size = 3
    if args.env == "CartPole-v0":
        state_size = 4
        action_size = 2
    model = PolicyNetwork(state_size, action_size)
    optimizer = optim.Adam(model.parameters(), args.learning_rate)
    start_n = 4
    reward_per_iteration = []
    for i in range(args.max_iterations):
        # args.demo toggles starting from a demo state instead of env.reset()
        if not args.demo:
            state = to_tensor(ENV.reset(), state_size)
        else:
            # start_n: take the n-th state of the demo as the start state.
            # Open question: how to choose it together with max_iterations --
            # run each start state several times, or only once?
            start_state = get_start_state(ENV, args.env, start_n)
            # Some environments expose ENV.env.s, others ENV.env.state;
            # there may be a more elegant way to handle this.
            if args.env == "FrozenLake-v0":
                ENV.env.s = start_state
                state = to_tensor(ENV.env.s, state_size)
            else:
                ENV.env.state = start_state
                state = to_tensor(ENV.env.state, state_size)
        reward_per_episode = []
        episode_loss = 0
        for step in range(args.max_steps):
            if args.render:
                ENV.render()
            action = select_action(model, state, get_epsilon(i), action_size)
            next_state, reward, done, _ = ENV.step(action)
            # Compute the Q-value of the chosen action
            q_val = compute_q_val(model, state, action)
            with torch.no_grad():
                # Don't compute gradient info for the target (semi-gradient)
                next_state = to_tensor(next_state, state_size)
                target = compute_target(model, reward, next_state, done, args.discount_factor)
            # Loss measures the error between the current and expected Q-values
            loss = F.smooth_l1_loss(q_val, target)
            # Backpropagate the loss through the network
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Accumulate a plain float so no tensors are retained
            episode_loss += loss.item()
            state = next_state
            reward_per_episode.append(reward)
            if done:
                break
        if i % args.print_every == 0:
            print("Reward", reward, sum(reward_per_episode))
            print("Step {:6d} with loss: {:4f}".format(i, episode_loss))
        reward_per_iteration.append(reward_per_episode)
    return reward_per_iteration

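# compute_q_val and compute_target are called above but not defined in the
# snippet. A sketch consistent with how the training loop uses them (single,
# unbatched transitions; model(state) assumed to return a 1-D tensor of
# per-action Q-values); the real helpers may differ, e.g. operate on batches.
import torch

def compute_q_val(model, state, action):
    # Q(s, a): select the Q-value of the action actually taken
    return model(state)[action]

def compute_target(model, reward, next_state, done, discount_factor):
    # Semi-gradient TD target: r + gamma * max_a Q(s', a),
    # with no bootstrap at terminal states
    bootstrap = 0.0 if done else model(next_state).max().item()
    return torch.tensor(reward + discount_factor * bootstrap)
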