def __init__(self, lmbda=0.5, c_puct=1, n_thr=15, time_limit=10):
    # Root node of the search tree: no parent, prior probability 1.0
    self.root = Node(None, 1.0)
    # Load the trained SL policy network and the value network
    self.policy_net = network.SLPolicy()
    serializers.load_npz('./models/sl_model.npz', self.policy_net)
    self.value_net = network.Value()
    serializers.load_npz('./models/value_model.npz', self.value_net)
    # Inference only: disable training mode and backprop globally
    chainer.config.train = False
    chainer.config.enable_backprop = False
    # Search hyperparameters
    self.lmbda = lmbda            # mixing weight for leaf evaluation
    self.c_puct = c_puct          # exploration constant in the PUCT score
    self.n_thr = n_thr            # visit count required before expanding a node
    self.time_limit = time_limit  # seconds allowed per move
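# The constructor above only loads the two networks and stores search
# hyperparameters.  As a minimal sketch of where c_puct enters the picture, a
# PUCT-style selection score can be computed as below; the attribute names
# (Q, P, n) and the helper name puct_score() are illustrative assumptions,
# not the actual interface of the Node class used in this repository.
def puct_score(parent_visits, child, c_puct=1.0):
    # PUCT: Q(s,a) + c_puct * P(s,a) * sqrt(N(s)) / (1 + N(s,a))
    return child.Q + c_puct * child.P * np.sqrt(parent_visits) / (1 + child.n)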
def __init__(self, auto):
    # Player 1 is either the SL policy network (auto mode) or a human
    if auto:
        self.p1 = "IaGo(SLPolicy)"
        self.model = network.SLPolicy()
        serializers.load_npz('./models/sl_model.npz', self.model)
    else:
        self.p1 = "You"
        self.model = None
    self.p2 = "IaGo(PV-MCTS)"
    # Initialize board state: 8x8 grid with the four starting stones
    # (1 and 2 mark the two players' stones, 0 is an empty square)
    self.state = np.zeros([8, 8], dtype=np.float32)
    self.state[4, 3] = 1
    self.state[3, 4] = 1
    self.state[3, 3] = 2
    self.state[4, 4] = 2
    # Initialize game variables
    self.stone_num = 4
    self.play_num = 1
    self.pass_flg = False
    self.date = datetime.now().strftime("%Y-%m-%d-%H-%M")
    self.gamelog = "IaGo \n" + self.date + "\n"
    self.mcts = MCTS.MCTS()
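# For reference, the starting position built above looks like this
# (rows and columns are 0-indexed; "." marks an empty square):
#
#     . . . . . . . .
#     . . . . . . . .
#     . . . . . . . .
#     . . . 2 1 . . .
#     . . . 1 2 . . .
#     . . . . . . . .
#     . . . . . . . .
#     . . . . . . . .
#
# i.e. state[3, 3] == state[4, 4] == 2 and state[3, 4] == state[4, 3] == 1,
# with stone_num == 4 stones on the board.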
def main():
    # Set the number of epochs and the policy to train
    parser = argparse.ArgumentParser(description='IaGo:')
    parser.add_argument('--epoch', '-e', type=int, default=10,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--policy', '-p', type=str, default="sl",
                        help='Policy to train: sl or rollout')
    parser.add_argument('--gpu', '-g', type=int, default=0, help='GPU ID')
    args = parser.parse_args()

    # Model definition
    if args.policy == "rollout":
        model = network.RolloutPolicy()
    else:
        if args.policy != "sl":
            print('Argument "--policy" is invalid. SLPolicy has been set by default.')
        model = network.SLPolicy()
    optimizer = optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))
    cuda.get_device(args.gpu).use()

    # Load test dataset
    X_test = np.load('../policy_data/npy/states_test.npy')
    y_test = np.load('../policy_data/npy/actions_test.npy')
    X_test, y_test = transform(X_test, y_test)
    # Load train dataset
    X_train = np.load('../policy_data/npy/states.npy')
    y_train = np.load('../policy_data/npy/actions.npy')
    train_size = y_train.shape[0]
    minibatch_size = 4096  # 2**12

    # Learning loop
    for epoch in tqdm(range(args.epoch)):
        model.to_gpu(args.gpu)
        # Shuffle train dataset
        rands = np.random.choice(train_size, train_size, replace=False)
        X_train = X_train[rands, :, :]
        y_train = y_train[rands]
        # Minibatch learning
        for idx in tqdm(range(0, train_size, minibatch_size)):
            x = X_train[idx:min(idx + minibatch_size, train_size), :, :]
            y = y_train[idx:min(idx + minibatch_size, train_size)]
            x, y = transform(x, y)
            pred_train = model(x)
            loss_train = F.softmax_cross_entropy(pred_train, y)
            model.cleargrads()
            loss_train.backward()
            optimizer.update()
        # Calculate test loss and accuracy
        with chainer.using_config('train', False):
            with chainer.using_config('enable_backprop', False):
                pred_test = model(X_test)
                loss_test = F.softmax_cross_entropy(pred_test, y_test).data
                test_acc = F.accuracy(pred_test, y_test).data
        print('\nepoch :', epoch, ' loss :', loss_test, ' accuracy:', test_acc)
        # Log
        if args.policy == "rollout":
            with open("../log/rollout.txt", "a") as f:
                f.write(str(loss_test) + ", " + str(test_acc) + "\n")
        else:
            with open("../log/sl.txt", "a") as f:
                f.write(str(loss_test) + ", " + str(test_acc) + "\n")
        # Save models
        model.to_cpu()
        if args.policy == "rollout":
            serializers.save_npz('../models/rollout_model.npz', model)
            serializers.save_npz('../models/rollout_optimizer.npz', optimizer)
        else:
            serializers.save_npz('../models/sl_model.npz', model)
            serializers.save_npz('../models/sl_optimizer.npz', optimizer)
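# transform() is not shown in this excerpt.  A minimal sketch of what such a
# helper could do, assuming it mirrors the two-plane board encoding used by
# the REINFORCE update below (one plane per stone value) and moves the arrays
# to the GPU -- the name transform_example and its behaviour are assumptions
# for illustration, not the repository's actual implementation:
def transform_example(X, y, gpu=0):
    # (B, 8, 8) integer boards -> (B, 2, 8, 8) float feature planes
    X = np.stack([X == 1, X == 2], axis=1).astype(np.float32)
    y = y.astype(np.int32)
    return cuda.to_gpu(X, gpu), cuda.to_gpu(y, gpu)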
def main():
    # Set the number of game sets
    parser = argparse.ArgumentParser(description='IaGo:')
    parser.add_argument('--models', '-m', type=int, default=1,
                        help='Number of trained models')
    parser.add_argument('--set', '-s', type=int, default=1000,
                        help='Number of game sets played to train')
    args = parser.parse_args()
    N = 32  # each set plays 2*N self-play games

    # Model definition
    model1 = network.SLPolicy()
    serializers.load_npz("../models/RL/model2.npz", model1)
    optimizer = optimizers.Adam()
    optimizer.setup(model1)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))
    serializers.load_npz("../models/RL/optimizers/2.npz", optimizer)

    # REINFORCE algorithm
    models = args.models
    cnt = 0
    # for set in tqdm(range(0, args.set)):
    while models <= 20:
        # Randomly choose a competitor from the pool of reinforced models
        model2 = network.SLPolicy()
        model2_path = np.random.choice(glob.glob("../models/RL/*.npz"))
        print(model2_path)
        serializers.load_npz(model2_path, model2)
        result = 0
        state_seq, action_seq, reward_seq = [], [], []
        for i in tqdm(range(2 * N)):
            game = rl_self_play.Game(model1, model2)
            if i % 2 == 1:
                # Switch head and tail
                pos = random.choice([[2, 4], [3, 5], [4, 2], [5, 3]])
                game.state[pos[0], pos[1]] = 2
            states, actions, judge = game()
            rewards = [judge] * len(states)
            state_seq += states
            action_seq += actions
            reward_seq += rewards
            if judge == 1:
                result += 1
        # Update model1 with the REINFORCE gradient
        x = np.array(state_seq)
        # Encode the boards as two binary feature planes (player 1 / player 2)
        x = np.stack([x == 1, x == 2], axis=0).astype(np.float32)
        x = Variable(x.transpose(1, 0, 2, 3))
        y = Variable(np.array(action_seq).astype(np.int32))
        r = Variable(np.array(reward_seq).astype(np.float32))
        pred = model1(x)
        c = F.softmax_cross_entropy(pred, y, reduce="no")
        model1.cleargrads()
        loss = F.mean(c * r)
        loss.backward()
        optimizer.update()
        rate = result / (2 * N)
        print("Models:" + str(models) + ", Result:" + str(rate) + ", Loss:" + str(loss.data))
        with open("../log/rl.txt", "a") as f:
            f.write(str(rate) + ", \n")
        if rate > 0.5:
            cnt += 1
        # Promote model1 into the opponent pool once it wins consistently
        if cnt > 4 * np.sqrt(models) and rate > 0.6:
            model = copy.deepcopy(model1)
            # model.to_cpu()
            serializers.save_npz("../models/RL/model" + str(models) + ".npz", model)
            serializers.save_npz("../models/RL/optimizers/" + str(models) + ".npz", optimizer)
            models += 1
            cnt = 0
        if rate < 0.2:
            break
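# Note on the update above: F.softmax_cross_entropy(..., reduce="no") returns
# -log pi(a_t | s_t) for each (state, action) pair, so minimizing
# loss = mean(c * r) is the REINFORCE policy-gradient step: moves played in
# games with positive reward have their log-probability increased, and moves
# played in games with negative reward have it decreased.  (That the reward
# is +1 for a win is inferred from "judge == 1" being counted as a win above;
# the value used for a loss is not visible in this excerpt.)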