def test_loss_cut_ab(seed=None):
    if seed is None:  # a random.random() default would be drawn once at import time, not per call
        seed = random.random()
    create_ev_table(ev_table)
    print("seed", seed)
    winning_rate = 0.0
    drow_count = 0
    for i in range(10):
        random.seed(seed * i)
        state = State()
        est_ii_state = EstimatedState()
        est_ii_state.create_est_ii_state_from_state(state)
        while True:
            # on game end
            if state.is_done():
                if state.is_lose():
                    if state.depth % 2 == 1:
                        winning_rate += 1  # first player won
                else:  # draw
                    winning_rate += 0.5
                    drow_count += 1
                break
            # get the action
            if state.is_first_player():
                action, est_ii_state = cut_loss_alpha_beta_action(est_ii_state, 5)
                est_ii_state.my_real_next(state, action)
            else:
                action = alpha_beta_action(state)
                est_ii_state.enemy_real_next(action)
                print(state)
            state = state.next(action)
        # play again under the same conditions with sides swapped
        random.seed(seed * i)
        state = State()
        est_ii_state = EstimatedState()  # fixed: re-initialize the estimated state for the new game
        est_ii_state.create_est_ii_state_from_state(state)
        while True:
            if state.is_done():
                if state.is_lose():
                    if state.depth % 2 == 0:
                        winning_rate += 1  # second player won
                else:  # draw
                    winning_rate += 0.5
                    drow_count += 1
                break
            # get the action
            if state.is_first_player():
                action = alpha_beta_action(state)
                est_ii_state.enemy_real_next(action)
            else:
                action = perfect_alpha_beta_action(state, 5)
                est_ii_state.my_real_next(state, action)
            state = state.next(action)
        print(winning_rate, (i + 1) * 2, drow_count)
def exp_reduction_effect(seed=None, reduction_func=IDDFS_alpha_beta_action):
    if seed is None:  # draw the seed per call rather than once at import
        seed = random.random()
    # set up
    create_ev_table(ev_table)
    print("seed", seed)
    reduction_ab_action = time_limit_alpha_beta(reduction_func)  # the side whose win rate is measured
    simple_ab_action = time_limit_alpha_beta(alpha_beta_action)  # the opponent
    winning_rate = 0.0
    drow_count = 0
    for i in range(50):
        random.seed(seed * i)
        state = State()
        while True:
            # on game end
            if state.is_done():
                if state.is_lose():
                    if state.depth % 2 == 1:
                        winning_rate += 1  # first player won
                else:  # draw
                    winning_rate += 0.5
                    drow_count += 1
                break
            # get the action
            if state.is_first_player():
                action = reduction_ab_action(state)
            else:
                action = simple_ab_action(state)
            state = state.next(action)
        # play again under the same conditions with sides swapped
        random.seed(seed * i)
        state = State()
        while True:
            if state.is_done():
                if state.is_lose():
                    if state.depth % 2 == 0:
                        winning_rate += 1  # second player won
                else:  # draw
                    winning_rate += 0.5
                    drow_count += 1
                break
            # get the action
            if state.is_first_player():
                action = simple_ab_action(state)
            else:
                action = reduction_ab_action(state)
            state = state.next(action)
        print(winning_rate, (i + 1) * 2, drow_count)
def exp_search_depth_effect(seed=None, deep_depth=5, shallow_depth=3, search_func=alpha_beta_action):
    if seed is None:  # draw the seed per call rather than once at import
        seed = random.random()
    # set up the evaluation table
    create_ev_table(ev_table)
    print("seed", seed)
    winning_rate = 0.0
    drow_count = 0
    for i in range(50):
        random.seed(seed * i)
        state = State()
        while True:
            # on game end
            if state.is_done():
                if state.is_lose():
                    if state.depth % 2 == 1:
                        winning_rate += 1  # first player won
                else:  # draw
                    winning_rate += 0.5
                    drow_count += 1
                break
            # get the action
            if state.is_first_player():
                action = search_func(state, deep_depth)  # deep search
            else:
                action = search_func(state, shallow_depth)  # shallow search
            state = state.next(action)
        # play again under the same conditions with sides swapped
        random.seed(seed * i)
        state = State()
        while True:
            if state.is_done():
                if state.is_lose():
                    if state.depth % 2 == 0:
                        winning_rate += 1  # second player won
                else:  # draw
                    winning_rate += 0.5
                    drow_count += 1
                break
            # get the action
            if state.is_first_player():
                action = search_func(state, shallow_depth)  # shallow search
            else:
                action = search_func(state, deep_depth)  # deep search
            state = state.next(action)
        print(winning_rate, (i + 1) * 2, drow_count)
def play(next_actions_num):
    state = State()
    while True:
        if state.is_done():
            break
        next_action_num = next_actions_num[0] if state.is_first_player() else next_actions_num[1]
        action_num = next_action_num(state)
        state = state.next(action_num)  # fixed: next() returns the new state rather than mutating
    return first_player_point(state)
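# Hedged usage sketch for play(next_actions_num) above: pair two action
# functions (first player, second player) and run one game. mcts_action and
# random_action are assumed to exist in this codebase with the signature
# action = f(state), and first_player_point is assumed to follow the usual
# 1 / 0.5 / 0 convention for a first-player win / draw / loss.
#     next_actions_num = (mcts_action, random_action)
#     point = play(next_actions_num)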
def exp_effect_of_search_depth(func_id=2, seed=None):
    if seed is None:  # draw the seed per call rather than once at import
        seed = random.random()
    # set up the evaluation table
    create_ev_table(ev_table, select_func(func_id))
    print("seed", seed)
    gamma = 100000  # disable the threshold cut
    depths = [2, 3, 4, 5, 6]
    for depth in depths:
        winning_rate = 0.0
        drows_count = 0
        for i in range(100):
            random.seed(seed * i)
            state = State()
            while True:
                # on game end
                if state.is_done():
                    if state.is_lose():
                        if state.depth % 2 == 0:
                            winning_rate += 1
                    else:  # draw
                        drows_count += 1
                        winning_rate += 0.5
                    break
                # get the action
                if state.is_first_player():
                    action = mcts_action(state)
                else:
                    action = alpha_beta_action(state, gamma, depth)
                state = state.next(action)
        print("win rate", winning_rate, "drows_count=", drows_count)
def turn_of_human(self, touch):
    global state
    # on game end, reset for a new game
    if state.is_done():
        state = State()
        self.reset()
        return
    # ignore input when it is not the human's (first player's) turn
    if not state.is_first_player():
        return
    # convert the click position to an action (3x3 board, 160-pixel cells)
    x = int(touch.pos[0] / 160)
    y = int(touch.pos[1] / 160)
    action = x + y * 3
    if x < 0 or 2 < x or y < 0 or 2 < y:  # out of range
        return
    # ignore illegal moves
    if not (action in state.legal_actions()):
        return
    # get the next state
    state = state.next(action)
    # draw the piece
    self.draw_piece(action)
    # AI's turn
    self.turn_of_ai()
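# Worked example of the click-to-action mapping in turn_of_human above: a
# touch at pos (250, 90) gives x = int(250 / 160) = 1 and y = int(90 / 160) = 0,
# so action = 1 + 0 * 3 = 1, i.e. the middle cell of row 0. Whether row 0 is
# drawn at the top or bottom depends on the GUI toolkit's coordinate origin
# (Kivy's touch.pos, for instance, puts y = 0 at the bottom of the window).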
def exp_value_changing(depth=5, func_id=3, gamma=1.0, seed=None):
    if seed is None:  # draw the seed per call rather than once at import
        seed = random.random()
    record_values = []  # record evaluation values
    record_boards = []  # record the boards alongside the values
    for i in range(100):
        random.seed(seed * (i + 1))
        state = State()
        ii_state = AccessableState()
        values = []
        boards = []
        while True:
            if state.is_done():
                break
            if state.is_first_player():
                action = move_ordering_alpha_beta_action(state, 1, depth, i)
                # compute and record the board's evaluation value
                ii_state.create_ii_state_from_state(state)
                values.append(evaluate_board_state(ii_state))
                boards.append([state.pieces, state.enemy_pieces])
            else:
                action = random_action(state)
            state = state.next(action)
        record_values.append(values)  # fixed typo: was .apped()
        record_boards.append(boards)  # fixed typo: was .apped()
    # TODO: write out to CSV
    print(record_values)
    print(record_boards)
def exp_fair_compete(depth=5, func_id=3, seed=None):
    if seed is None:  # draw the seed per call rather than once at import
        seed = random.random()
    gamma = 100000  # disable the threshold cut
    restricts = [True, False]
    print(seed)
    for restrict in restricts:
        create_ev_table(ev_table, select_func(func_id))
        winning_rate = 0.0
        drows_count = 0
        for i in range(100):
            random.seed(seed * i)
            state = State()
            while True:
                # on game end
                if state.is_done():
                    if state.is_lose():
                        if state.depth % 2 == 0:
                            winning_rate += 1
                    else:  # draw
                        winning_rate += 0.5
                        drows_count += 1
                    break
                # get the action
                if state.is_first_player():
                    action = alpha_beta_action(state, gamma, depth, not restrict)
                else:
                    action = alpha_beta_action(state, gamma, depth, restrict)
                state = state.next(action)
        print("win rate when the agent with restrict =", restrict, "plays second")
        print(winning_rate, "drows_count=", drows_count)
def exp_effect_of_action_restrict_for_compete(depth=5, func_id=2, rdm=None):
    if rdm is None:  # draw the seed per call rather than once at import
        rdm = random.random()
    gamma = 100000  # disable the threshold cut
    restricts = [True, False]
    for restrict in restricts:
        create_ev_table(ev_table, select_func(func_id))
        winning_rate = 0.0
        drows_count = 0
        for i in range(100):
            random.seed(rdm * i)
            state = State()
            while True:
                # on game end
                if state.is_done():
                    if state.is_lose():
                        if state.depth % 2 == 0:
                            winning_rate += 1
                    else:  # draw
                        winning_rate += 0.5
                        drows_count += 1
                    break
                # get the action
                if state.is_first_player():
                    action = ii_mcts_action(state)
                else:
                    action = alpha_beta_action(state, gamma, depth, restrict)
                state = state.next(action)
        print("restrict", restrict)
        print(winning_rate, "drows_count=", drows_count)
def exp_effect_of_action_restrict_for_time(depth=5, func_id=2):
    gamma = 100000  # disable the threshold cut
    create_ev_table(ev_table, select_func(func_id))  # fixed evaluation function
    state = State()
    restrict_time = 0.0
    no_restrict_time = 0.0
    while True:
        # on game end
        if state.is_done():
            break
        # get the action
        if state.is_first_player():
            action = random_action(state)  # random move
        else:
            # with the action-count reduction
            start = time.time()
            for _ in range(50):
                action = alpha_beta_action(state, gamma, depth, True)
            restrict_time += time.time() - start
            # without the action-count reduction
            start = time.time()
            for _ in range(50):
                action = alpha_beta_action(state, gamma, depth, False)
            no_restrict_time += time.time() - start
            action = random_action(state)  # let both sides move randomly
        state = state.next(action)
    print("restrict:", restrict_time, "no_restrict:", no_restrict_time)
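# Note (suggestion, not in the original): for interval timing like the blocks
# above, time.perf_counter() is the higher-resolution, monotonic choice;
# time.time() can jump if the system clock is adjusted mid-measurement.
# A drop-in sketch:
#     start = time.perf_counter()
#     ...
#     restrict_time += time.perf_counter() - start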
# note: this redefines exp_effect_of_search_depth above; if both live in one
# module, the later definition wins
def exp_effect_of_search_depth():
    gamma = 100000  # disable the threshold cut
    rdm = random.random()
    for func_id in range(8):
        create_ev_table(ev_table, select_func(func_id))
        winning_rate = 0.0
        drows_count = 0
        for i in range(100):
            random.seed(rdm * i)
            state = State()
            while True:
                # on game end
                if state.is_done():
                    if state.is_lose():
                        if state.depth % 2 == 0:
                            winning_rate += 1
                    else:  # draw
                        winning_rate += 0.5
                        drows_count += 1
                    break
                # get the action
                if state.is_first_player():
                    action = ii_mcts_action(state)
                else:
                    action = alpha_beta_action(state, gamma, 5)
                state = state.next(action)
        print(winning_rate, "id=", func_id, "drows_count=", drows_count)
def play(model):
    history = []
    state = State()
    while True:
        if state.is_done():
            break
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([state.pieces_array(), policies, None])
        action = np.random.choice(state.legal_actions(), p=scores)
        state = state.next(action)
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
def play(model):
    # training data
    history = []
    # create the state
    state = State()
    while True:
        # on game end
        if state.is_done():
            break
        # get the probability distribution over legal moves
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)
        # add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([state.pieces_array(), policies, None])
        # get the action
        action = np.random.choice(state.legal_actions(), p=scores)
        # get the next state
        state = state.next(action)
    # add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
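# A minimal sketch, assuming the usual convention for the first_player_value
# used by the play() variants here (its definition is not in this file): it
# scores a finished game from the first player's perspective, which is why
# play() flips the sign while walking history so each entry stores the value
# of the player to move at that position.
def first_player_value(ended_state):
    # is_lose() refers to the player to move in the final position
    if ended_state.is_lose():
        return -1 if ended_state.is_first_player() else 1
    return 0  # draw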
def play(model):
    # training data
    history = []
    # create the state
    state = State()
    while True:
        # on game end
        if state.is_done():
            break
        # get the probability distribution over legal moves
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)
        # add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([[state.pieces, state.enemy_pieces], policies, None])
        # get the action
        action = np.random.choice(state.legal_actions(), p=scores)
        # get the next state
        state = state.next(action)
    # add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
def exp_gamma_time(depth=5, func_id=2, seed=None):
    if seed is None:  # draw the seed per call rather than once at import
        seed = random.random()
    print("seed", seed)
    random.seed(seed)
    state = State()
    create_ev_table(ev_table, select_func(func_id))
    keep_gamma_time = [0] * 30
    # loop until the game ends
    while True:
        # on game end
        if state.is_done():
            break
        # get the action
        if state.is_first_player():
            action = random_action(state)
        else:
            gamma = 0.0
            for index, _ in enumerate(keep_gamma_time):
                start = time.time()
                for _ in range(100):
                    # action = alpha_beta_action(state, gamma)
                    action = alpha_beta_action(state, gamma, depth, False)
                keep_gamma_time[index] += time.time() - start
                gamma += 0.1
            # take a random action to add variety to the data
            action = random_action(state)
            print(keep_gamma_time)
        # get the next state
        state = state.next(action)
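# Note (suggestion, not in the original): accumulating `gamma += 0.1` drifts in
# floating point (0.1 + 0.1 + 0.1 == 0.30000000000000004 in Python), so the
# swept values slowly diverge from the intended decimals. Deriving gamma from
# the loop index keeps them clean; the same applies to exp_gamma_winning_rate
# further below:
#     for index, _ in enumerate(keep_gamma_time):
#         gamma = index / 10
#         ...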
def vs_mcts(ev_func, seed, buttle_num):
    winning_rate = 0.0
    drow_count = 0
    for i in range(buttle_num):
        random.seed(seed * i)
        state = State()
        while True:
            # on game end
            if state.is_done():
                if state.is_lose():
                    if state.depth % 2 == 1:
                        winning_rate += 1  # first player won
                else:  # draw
                    winning_rate += 0.5
                    drow_count += 1
                break
            # get the action
            if state.is_first_player():
                action = alpha_beta_action(state, ev_func, 5)
            else:
                action = mcts_action(state)
            state = state.next(action)
        # play again under the same conditions with sides swapped
        random.seed(seed * i)
        state = State()
        while True:
            if state.is_done():
                if state.is_lose():
                    if state.depth % 2 == 0:
                        winning_rate += 1  # second player won
                else:  # draw
                    winning_rate += 0.5
                    drow_count += 1
                break
            # get the action
            if state.is_first_player():
                action = mcts_action(state)
            else:
                action = alpha_beta_action(state, ev_func, 5)
            state = state.next(action)
    print(winning_rate, drow_count)
    return winning_rate
def play(model, using_saved_state=False, saving_ontheway_state=False):
    '''Run a single game.'''
    # training data
    history = []
    # create the state
    if using_saved_state:
        state = load_state()
        if not state:
            state = State()
    else:
        state = State()
    starttime = time.time()
    print('')
    while True:
        # on game end
        if state.is_done():
            endtime = time.time()
            print("first player is ", "lose" if state.is_lose() else "win")
            print("first player num:", state.piece_count(state.pieces))
            print('elapsed time', endtime - starttime)
            print(state)
            break
        # get the probability distribution over legal moves
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)
        # add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([[state.pieces, state.enemy_pieces], policies, None])
        # get the action
        if len(history) % 10 == 0:
            print("state len: ", len(history))
            print(state)
        if saving_ontheway_state and len(history) == 25:
            save_state(state)
        action = np.random.choice(state.legal_actions(), p=scores)
        # get the next state
        state = state.next(action)
    # add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
def play(model):
    # training data
    history = []
    # create the state
    state = State()
    while True:
        # on game end
        if state.is_done():
            break
        # get the probability distribution over legal moves
        scores, values = pv_mcts_scores(model, state, SP_TEMPERATURE)
        # add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        # get the action
        action = np.random.choice(state.legal_actions(), p=scores)
        # state, policy, value, search result, chosen action, and the positions that follow
        history.append([[state.pieces, state.enemy_pieces], policies, None,
                        values, action, None])
        # get the next state
        state = state.next(action)
    # add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    # keep the final position's info
    last_state = history[-1][0]
    last_policy = [0] * DN_OUTPUT_SIZE
    v0 = history[0][2]
    v1 = history[1][2]
    for i in range(len(history)):
        rp = []
        for inc in range(3):
            index = i + inc
            if index < len(history):
                rp.append(history[index])
            else:
                # past the end of the game: pad with the final position
                v = v0 if ((i + inc) % 2) == 0 else v1
                a = randint(9)
                rp.append([last_state, last_policy, v, v, a, None])
        history[i][5] = rp
    return history
def play(model):
    history = []
    state = State()
    while True:
        if state.is_done():
            break
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)
        with open('action_list.txt', 'rb') as f:
            action_list = pickle.load(f)
        # print('action_list:', len(action_list))
        policies = np.zeros(len(action_list))
        # for action_num, policy in zip(state.legal_actions(), scores):
        #     policies[action_num] = policy
        # print('size check', len(policies), len(scores))
        legal_actions = state.legal_actions()
        for i in range(len(legal_actions)):
            policies[legal_actions[i]] = scores[i]
        # print(policies)
        # print('policies:', policies)
        history.append([[state.pieces, state.enemy_pieces], policies, None])
        # action_list_num = np.arange(len(action_list))
        # action_num = np.random.choice(action_list_num, p=scores)
        action_num = np.random.choice(legal_actions, p=scores)
        # print(action_num)
        state = state.next(action_num)  # fixed: next() returns the new state rather than mutating
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
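# Sanity-check sketch for the sampling in play() above (assumption:
# pv_mcts_scores returns one probability per legal action, in the same order,
# summing to 1 — np.random.choice raises a ValueError otherwise):
#     assert len(scores) == len(legal_actions)
#     assert abs(sum(scores) - 1.0) < 1e-6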
def play(next_actions):
    state = State()
    while True:
        if state.is_done():
            break
        next_action = next_actions[0] if state.is_first_player() else next_actions[1]
        action = next_action(state)
        state = state.next(action)
    return first_player_point(state)
def play(next_actions):
    # create the state
    state = State()
    # loop until the game ends
    while True:
        # on game end
        if state.is_done():
            break
        # get the action
        next_action = next_actions[0] if state.is_first_player() else next_actions[1]
        action = next_action(state)
        # get the next state
        state = state.next(action)
    # return the first player's points
    return first_player_point(state)
def exp_move_ordering_time(depth=5, func_id=3, gamma=1.0, seed=None):
    if seed is None:  # draw the seed per call rather than once at import
        seed = random.random()
    print("seed", seed)
    timer = [0.0] * (depth + 1)
    random.seed(seed)
    state = State()
    while True:
        if state.is_done():
            break
        # time move-ordering alpha-beta for each ordering depth 0..depth
        for i in range(depth + 1):
            start = time.time()
            for _ in range(1):
                move_ordering_alpha_beta_action(state, 1, depth, i)
            timer[i] += time.time() - start
        action = random_action(state)
        state = state.next(action)  # advance the game with random moves
def play(next_actions):
    # create the state
    state = State()
    # loop until the game ends
    while True:
        # on game end
        if state.is_done():
            break
        # get the action
        next_action = next_actions[0] if state.is_first_player() else next_actions[1]
        action = next_action(state)
        # get the next state
        state = state.next(action)
    # return the first player's points
    return first_player_point(state)
def play(next_actions) -> float:
    """Run a single game."""
    state = State()  # create the state
    # loop until the game ends
    while True:
        if state.is_done():
            break
        # get the action
        next_action = next_actions[0] if state.is_first_player() else next_actions[1]
        action = next_action(state)
        # get the next state
        state = state.next(action)
    # return the first player's points
    return first_player_point(state)
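# Hedged usage sketch of how a play() like the ones above is typically driven
# to compare two agents. evaluate_pair and game_count are illustrative names,
# not from this file, and first_player_point is assumed to return 1 / 0.5 / 0
# for a first-player win / draw / loss.
def evaluate_pair(next_actions, game_count=100):
    total_point = 0.0
    for i in range(game_count):
        if i % 2 == 0:
            total_point += play(next_actions)
        else:
            # swap sides so neither agent always gets the first-move advantage;
            # a point for the swapped first player counts against agent 0
            total_point += 1 - play(list(reversed(next_actions)))
    return total_point / game_count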
def test_dual_network():
    # load the three trained networks
    model0 = RepNet()
    model1 = DynamicsNet()
    model2 = PredictNet()
    model0.load_state_dict(torch.load('./model/best_r.h5'))
    model1.load_state_dict(torch.load('./model/best_d.h5'))
    model2.load_state_dict(torch.load('./model/best_p.h5'))
    model0 = model0.double()
    model1 = model1.double()
    model2 = model2.double()
    state = State()
    action = 0
    next_state = state.next(action)
    # encode the board as the network input
    file, rank, channel = DN_INPUT_SHAPE
    x = np.array([state.pieces, state.enemy_pieces])
    x = x.reshape(channel, file, rank)
    x = np.array([x])
    x = torch.tensor(x, dtype=torch.double)
    # representation network: observation -> hidden state
    hidden = model0(x)
    # dynamics network: (hidden state, action) -> next hidden state
    action = np.array([0])
    at = action_to_tensor(action)
    hidden = model1(hidden, at)
    print("----------------------------------")
    # prediction network: hidden state -> (policy, value)
    policy, value = model2(hidden)
    print(policy.shape)
    print(value.shape)
    print(hidden.shape)
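# Hedged sketch: unrolling further dynamics steps from test_dual_network's
# hidden state, assuming model1 (DynamicsNet) returns a hidden tensor with the
# same shape it consumes, as MuZero-style dynamics networks typically do:
#     for a in (0, 1, 2):
#         hidden = model1(hidden, action_to_tensor(np.array([a])))
#         policy, value = model2(hidden)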
def exp_gamma_winning_rate(depth=5, func_id=2, seed=None):
    if seed is None:  # draw the seed per call rather than once at import
        seed = random.random()
    # set up the evaluation table
    create_ev_table(ev_table, select_func(func_id))
    keep_gamma_winning_rate = [0] * 30
    print("seed", seed)
    gamma = 0.0
    for index, _ in enumerate(keep_gamma_winning_rate):
        winning_rate = 0.0
        for i in range(100):
            random.seed(seed * i)
            state = State()
            while True:
                # on game end
                if state.is_done():
                    if state.is_lose():
                        if state.depth % 2 == 0:
                            winning_rate += 1  # second player won
                        # elif state.depth % 2 == 1:
                        #     pass  # first player won
                    else:  # draw
                        winning_rate += 0.5
                    break
                # get the action
                if state.is_first_player():
                    # action = random_action(state)
                    action = mcts_action(state)
                else:
                    # action = alpha_beta_action(state, gamma)
                    action = alpha_beta_action(state, gamma, depth, False)
                state = state.next(action)
        keep_gamma_winning_rate[index] = winning_rate
        print(keep_gamma_winning_rate)
        gamma += 0.1
    print(keep_gamma_winning_rate)
def play(model):
    # training data
    history = []
    # create the state
    state = State()
    while True:
        # on game end
        if state.is_done():
            break
        # get the probability distribution over legal moves
        # (model, game state, temperature parameter: a variable used to add variability)
        # a score is computed for each node
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)
        # add the state and policy to the training data
        policies = [0] * DN_OUTPUT_SIZE  # number of actions: 7
        # candidate columns where a piece can be dropped, filled with their scores
        for action, policy in zip(state.legal_actions(), scores):
            # set the policy for the given column
            policies[action] = policy
        # record the entry ([my pieces, enemy pieces], policy, None (value))
        history.append([[state.pieces, state.enemy_pieces], policies, None])
        # get the action
        action = np.random.choice(state.legal_actions(), p=scores)
        # get the next state
        state = state.next(action)
    # add the value to the training data
    value = first_player_value(state)
    for i in range(len(history)):
        history[i][2] = value
        value = -value
    return history
def turn_of_human(self, touch):
    global state
    # on game end, reset for a new game
    if state.is_done():
        state = State()
        self.reset()
        return
    # ignore input when it is not the human's (first player's) turn
    if not state.is_first_player():
        return
    # convert the click position to an action (6x6 board, 80-pixel cells)
    x = int(touch.pos[0] / 80)
    y = int(touch.pos[1] / 80)
    action = x + y * 6
    if x < 0 or 5 < x or y < 0 or 5 < y:  # out of range
        return
    # ignore illegal moves (action 36 is a pass)
    legal_actions = state.legal_actions()
    if legal_actions == [36]:
        action = 36  # forced pass
    if action != 36 and not (action in legal_actions):
        return
    # get the next state
    state = state.next(action)
    # draw the pieces
    self.draw_piece()
    sleep(1)
    # AI's turn
    self.turn_of_ai()
def keisoku():
    buttle_num = 0
    for _ in range(100):
        # create the state
        state = State()
        create_ev_table(ev_table)
        create_red_ev_table(red_ev_table)  # the big experiment
        keep_info = KeepInfo()
        counter = [0] * 148
        # loop until the game ends
        while True:
            # on game end
            if state.is_done():
                buttle_num += 1
                break
            # get the action (note: both branches currently call the same function)
            if state.is_first_player():
                action, counter = check_unnecessary_action(state, counter)
            else:
                action, counter = check_unnecessary_action(state, counter)
            state = state.next(action)
        print(buttle_num, "games played")
        print(counter)
if __name__ == "__main__":
    os.environ["OMP_NUM_THREADS"] = "1"
    with open("config.yaml") as f:
        args = yaml.safe_load(f)
    # print(args)

    # experiment code goes here
    state = State()
    while True:
        if state.is_done():  # fixed: without this check the loop never terminates
            break
        print(state.legal_actions())
        state = state.next(random_action(state))

    # path = "models/10000.pth"
    # EvalHandyRL(100, path)
    # policies = obs_to_policy_to_use_game(agent, obs, state)
    # print(policies)
    # convert_state_to_obs(state)
    # test_predict()
    # test_cigeister()
    # fetch the policy
    # connection part