import unittest

import numpy as np

# GameState is assumed to be imported from this repository's game module.


class TestGameState(unittest.TestCase):
    def setUp(self):
        self.gs = GameState()

    def test_outputs_to_move_max(self):
        # With a fixed output vector, the greedy move is deterministic, so
        # the position after one move is reproducible.
        outputs = np.linspace(0.0, 1.0, 100)
        self.gs.outputs_to_move_max(outputs)
        expected = np.array([[-1, -1, -2, -1, -1],
                             [0, 0, 0, 0, 1],
                             [0, 0, 0, 0, 0],
                             [0, 0, 0, 0, 0],
                             [1, 1, 2, 1, 0]])
        self.assertTrue((self.gs.board == expected).all())
        # A second greedy move must change the board again.
        self.gs.outputs_to_move_max(outputs)
        self.assertFalse((self.gs.board == expected).all())

    def test_outputs_to_move_random(self):
        # Normalize so the outputs form a probability distribution.
        outputs = np.linspace(0.0, 1.0, 100)
        outputs /= np.sum(outputs)
        self.gs.outputs_to_move_random(outputs)

    def test_flip(self):
        # The initial position is symmetric, so flipped and unflipped input
        # planes must be identical.
        self.assertTrue(
            (self.gs.to_inputs() == self.gs.to_inputs(True)).all())
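
# Standard unittest entry point so this file can also be run directly
# (test discovery via `python -m unittest` works as well).
if __name__ == "__main__":
    unittest.main()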
# `np` (numpy) and GameState are assumed to be imported at the top of the
# enclosing module; this method belongs to the network wrapper class.
def replay(self, wps, pi_mcts, board_logs, plus_turns, weights,
           batch_size: int, beta: float) -> None:
    """Fit the model on a random mini-batch of self-play positions."""
    inputs = np.zeros((batch_size, 7, 5, 3))
    policy_true = np.zeros((batch_size, 315))
    values_true = np.zeros(batch_size)
    input_weights = np.zeros(batch_size)
    # Sample batch_size positions without replacement.
    indices = np.random.choice(np.arange(len(wps)), size=batch_size,
                               replace=False)
    mini_batch = [(wps[i], pi_mcts[i], board_logs[i], plus_turns[i],
                   weights[i]) for i in indices]
    for i, (winner, pi, board, plus_turn, weight) in enumerate(mini_batch):
        gs = GameState()
        gs.board = board
        # Present the board from the side-to-move's perspective; the result
        # must match the per-sample shape allocated for `inputs` above.
        inputs[i] = gs.to_inputs(flip=not plus_turn)
        # Sharpen the MCTS visit distribution with exponent beta.
        policy_true[i] = pi**beta
        values_true[i] = winner
        input_weights[i] = weight
    # epochs is the number of passes over the training data; verbose=0
    # suppresses progress output.
    self.model.fit(inputs, [policy_true, values_true],
                   sample_weight=input_weights,
                   epochs=1, verbose=0, shuffle=True)
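
# A minimal sketch of how replay() might be driven from a self-play loop.
# Everything below is illustrative: the wrapper instance `net`, the record
# source `self_play_records`, and the hyperparameter values are assumptions,
# not this repository's actual training script.
#
#     net = ...  # instance of the wrapper class defining replay()
#     wps, pi_mcts, board_logs, plus_turns, weights = [], [], [], [], []
#     for winner, pi, board, plus_turn in self_play_records:
#         wps.append(winner)            # +1 / -1 game result
#         pi_mcts.append(pi)            # MCTS visit distribution (length 315)
#         board_logs.append(board)      # raw 5x5 board array
#         plus_turns.append(plus_turn)  # True if plus was to move
#         weights.append(1.0)           # uniform sample weights
#     if len(wps) >= 256:
#         net.replay(wps, pi_mcts, board_logs, plus_turns, weights,
#                    batch_size=256, beta=1.0)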
import datetime
import json

import numpy as np
from tqdm import trange

# Project-local names (Config, QNetwork, Memory, GameState, Winner,
# take_action_eps_greedy) are assumed to be imported from this repository.


def learn(model_config_path=None, weight_path=None):
    config = Config()
    qc = config.Qlearn
    # Rewards of the most recent episodes, used for the running mean.
    total_reward_vec = np.zeros(qc.num_consecutive_iterations)

    # Create the Q-networks and the replay memory -----------------------------
    if model_config_path is None or weight_path is None:
        mainQN = QNetwork(config)    # main Q-network (action selection)
        mainQN.build()
        targetQN = QNetwork(config)  # target Q-network (value estimation)
        targetQN.build()
    else:
        mainQN = QNetwork(config)
        success_load = mainQN.load(model_config_path, weight_path)
        if not success_load:
            raise FileNotFoundError(
                f"Could not load {model_config_path} or {weight_path}")
        targetQN = QNetwork(config)
        targetQN.load(model_config_path, weight_path)
    memory = Memory(max_size=qc.memory_size)

    for episode in trange(qc.num_episodes):  # repeat for qc.num_episodes episodes
        gs = GameState()
        state = gs.random_play()  # the first move is random
        episode_reward = 0
        # Synchronize the target network with the main network.
        targetQN.model.set_weights(mainQN.model.get_weights())

        for t in range(qc.max_number_of_steps):  # one iteration = two plies
            board = gs.to_inputs()
            # Choose the agent's action at time t (epsilon-greedy).
            state, action = take_action_eps_greedy(board, episode, mainQN, gs)
            # next_state, reward, done, info = env.step(action)
            # (gym-style: compute s_{t+1} and r_t from executing a_t)

            # verbose ==========
            # if t % 10 == 9:
            #     print(gs)
            # ==================

            if state == Winner.minus:
                reward = qc.reward_win  # reward for winning
            else:
                reward = 0
            next_board = gs.to_inputs()
            # board = next_board  # state update

            # End-of-episode handling (agent's move ended the game).
            if state != Winner.not_ended:
                episode_reward += reward  # update the cumulative reward
                memory.add((board, action, reward, next_board))  # update replay memory

                # Learn and update the Q-network weights via experience replay.
                if len(memory) > qc.batch_size:  # and not islearned:
                    mainQN.replay(memory, qc.batch_size, qc.gamma, targetQN)
                if qc.DQN_MODE:
                    # Keep the target network identical to the main network.
                    targetQN.model.set_weights(mainQN.model.get_weights())

                total_reward_vec = np.hstack(
                    (total_reward_vec[1:], episode_reward))  # record the reward
                print('%d/%d: Episode finished after %d time steps'
                      ' / mean %f winner: %s'
                      % (episode + 1, qc.num_episodes, t + 1,
                         total_reward_vec.mean(),
                         'plus' if state == Winner.plus else 'minus'))
                break

            # Opponent replies with a random move.
            state, _ = gs.random_play()
            if state == Winner.plus:
                reward = qc.reward_lose
            else:
                reward = 0
            episode_reward += reward  # update the cumulative reward
            memory.add((board, action, reward, next_board))  # update replay memory

            # Learn and update the Q-network weights via experience replay.
            if len(memory) > qc.batch_size:  # and not islearned:
                mainQN.replay(memory, qc.batch_size, qc.gamma, targetQN)
            if qc.DQN_MODE:
                # Keep the target network identical to the main network.
                targetQN.model.set_weights(mainQN.model.get_weights())

            # End-of-episode handling (opponent's move ended the game).
            if state != Winner.not_ended:
                total_reward_vec = np.hstack(
                    (total_reward_vec[1:], episode_reward))  # record the reward
                print('%d/%d: Episode finished after %d time steps'
                      ' / mean %f winner: %s'
                      % (episode + 1, qc.num_episodes, t + 1,
                         total_reward_vec.mean(),
                         'plus' if state == Winner.plus else 'minus'))
                break

        # Stop based on the mean reward over recent episodes:
        # if total_reward_vec.mean() >= goal_average_reward:
        #     print('Episode %d train agent successfully!' % episode)
        #     islearned = True

        # Periodic checkpoint.
        if episode % qc.save_interval == qc.save_interval - 1:
            d = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
            mainQN.save(f"results/001_QLearning/{d}-mainQN.json",
                        f"results/001_QLearning/{d}-mainQN.h5")
            with open(f"results/001_QLearning/{d}-config.json", 'x') as f:
                json.dump(config._to_dict(), f, indent=4)

    # Save at the end as well (skipped if the last episode just saved).
    if episode % qc.save_interval != qc.save_interval - 1:
        d = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        mainQN.save(f"results/001_QLearning/{d}-mainQN.json",
                    f"results/001_QLearning/{d}-mainQN.h5")
        with open(f"results/001_QLearning/{d}-config.json", 'x') as f:
            json.dump(config._to_dict(), f, indent=4)
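
# A minimal sketch of a command-line entry point for this script. The flag
# names are assumptions; they simply forward to learn(), whose two optional
# arguments resume training from a saved (config json, weights h5) pair.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Q-learning self-play training")
    parser.add_argument("--model-config", default=None,
                        help="path to a saved model config json")
    parser.add_argument("--weights", default=None,
                        help="path to the matching weights h5 file")
    args = parser.parse_args()
    learn(model_config_path=args.model_config, weight_path=args.weights)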