def rtn_q(self, ban, model, player_side, action):
    """Estimate the value of playing ``action`` with a shallow look-ahead.

    The move is applied to a copy of ``ban``; the caller's board is never
    modified.  The opponent's reply is then simulated and the resulting
    position is scored from ``player_side``'s point of view.

    Args:
        ban: Board object (copied with ``copy.deepcopy``).
        model: Callable returning ``(policy, value)`` for an input state.
        player_side: Side making ``action``; the opponent is
            ``1 - player_side``.
        action: Indexable pair ``(row, col)`` of the move to evaluate.

    Returns:
        ``1`` if ``action`` wins immediately, ``0`` if the board becomes
        full (after either move), ``-1`` if the opponent has an immediately
        winning reply, otherwise the maximum of the model's policy output
        plus the board's availability term for the position after the
        opponent's reply.
    """
    opponent_side = 1 - player_side

    ban_copy = copy.deepcopy(ban)
    ban_copy.ban_applay(player_side, action[0], action[1])
    if ban_copy.ban_win(player_side, action[0], action[1]):
        return 1
    if ban_copy.ban_fill():
        return 0

    # Opponent's policy for the position after our move.
    state = chg_input_cnn(ban_copy, opponent_side)
    p_ary, _ = model(state.to(self.device))
    p_ary = p_ary.detach().cpu().numpy()[0]
    ban_put_available = ban_copy.ban_put_available()

    # If any legal reply wins for the opponent, the loss is certain.
    # The original only set a flag here and kept looping, so the reply it
    # later simulated was the *last* legal cell rather than the winning
    # one, and the loss usually went unreported.
    for r_op, c_op in ban_put_available:
        ban_copy2 = copy.deepcopy(ban_copy)
        ban_copy2.ban_applay(opponent_side, r_op, c_op)
        if ban_copy2.ban_win(opponent_side, r_op, c_op):
            return -1

    # No winning reply exists: assume the opponent plays its
    # highest-scoring legal move.
    for index in np.argsort(p_ary)[::-1]:
        r_op, c_op = index2rc(index)
        if [r_op, c_op] in ban_put_available:
            break
    ban_copy.ban_applay(opponent_side, r_op, c_op)

    # Every legal reply was tested above, so this reply cannot be a win;
    # only the "board full" case remains to be checked.
    if ban_copy.ban_fill():
        return 0

    # Score the resulting position with our own policy output.
    state = chg_input_cnn(ban_copy, player_side)
    p_ary, _ = model(state.to(self.device))
    p_ary = p_ary.detach().cpu().numpy()[0]
    # NOTE(review): presumably a per-cell term that penalises occupied
    # cells so the max is taken over legal moves only -- confirm against
    # rtn_put_available_position().
    put_available_position = ban_copy.rtn_put_available_position()
    return np.max(p_ary + put_available_position)
def check_win_rate_random_ai_first(Env, brain, model, max_episode):
    """Measure how the model fares against a random player, model first.

    Side 1 is driven by ``decide_action_func(model, ...)`` and moves first;
    side 0 picks uniformly among the legal moves.

    Args:
        Env: Board class, instantiated as ``Env(BANHEN, WINREN)``.
        brain: Unused.
        model: Model passed to ``decide_action_func``.
        max_episode: Number of games to play.

    Returns:
        Tuple ``(win_rate, not_lose_rate)`` in percent: games won by side 1,
        and games won by side 1 or drawn, each over ``max_episode``.
    """
    ai_side, random_side = 1, 0
    ai_wins = 0
    draws = 0
    ban = Env(BANHEN, WINREN)

    for episode in range(max_episode):
        print("\rstep : {0}/{1} ".format(episode, max_episode), end="")
        ban.ban_reset()

        while True:
            # Model's move.
            state = chg_input_cnn(ban, ai_side)
            action, _ = decide_action_func(model, ban, state)
            ban.ban_applay(ai_side, action[0], action[1])
            if ban.ban_win(ai_side, action[0], action[1]):
                ai_wins += 1
                break
            if ban.ban_fill():
                draws += 1
                break

            # Random player's move; a win here counts for neither rate.
            action = random.choice(ban.ban_put_available())
            ban.ban_applay(random_side, action[0], action[1])
            if ban.ban_win(random_side, action[0], action[1]):
                break
            if ban.ban_fill():
                draws += 1
                break

    win_rate = 100 * ai_wins / max_episode
    not_lose_rate = 100 * (ai_wins + draws) / max_episode
    return win_rate, not_lose_rate
def check_win_rate_put_1st(Env, brain, model, max_episode):
    """Play the model against a scripted "first free cell" opponent.

    Side 0 moves first and always takes the first entry of
    ``ban.ban_put_available()``; side 1 is driven by
    ``decide_action_func(model, ...)``.

    A single game is played regardless of ``max_episode`` (as in the
    original).  The original divided the result by ``max_episode`` even
    though only one game was played, which under-reported the rate whenever
    ``max_episode != 1`` and raised ``ZeroDivisionError`` for ``0``; the
    rate is now taken over the games actually played.

    Args:
        Env: Board class, instantiated as ``Env(BANHEN, WINREN)``.
        brain: Unused.
        model: Model passed to ``decide_action_func``.
        max_episode: Accepted for interface compatibility; not used.

    Returns:
        Percentage of played games that ended on side 1's move (side 1 won,
        or the board filled up on its move).
    """
    scripted_side, ai_side = 0, 1
    episodes_played = 1
    ended_on_ai_move = 0
    ban = Env(BANHEN, WINREN)

    for episode in range(episodes_played):
        ban.ban_reset()
        while True:
            # Scripted opponent: lowest-index free cell.
            action = ban.ban_put_available()[0]
            ban.ban_applay(scripted_side, action[0], action[1])
            # NOTE(review): a draw that ends on the scripted player's move
            # is not counted toward the returned rate, although the original
            # comment describes the result as the "did not lose"
            # probability -- confirm whether such draws should count.
            if (ban.ban_win(scripted_side, action[0], action[1])
                    or ban.ban_fill()):
                break

            # Model's reply.
            state = chg_input_cnn(ban, ai_side)
            action, _ = decide_action_func(model, ban, state)
            ban.ban_applay(ai_side, action[0], action[1])
            if ban.ban_win(ai_side, action[0], action[1]) or ban.ban_fill():
                ended_on_ai_move += 1
                break

    win_rate = 100 * ended_on_ai_move / episodes_played
    return win_rate
# --- Training-run bookkeeping -------------------------------------------
# step:         move number within the current game
# gen_num:      initial model generation
# episode_sum:  cumulative number of episodes played
step = step_sum = gen_num = episode_sum = ep_random_data = 0
search_depth = 3

log_print("lrはtextファイルから読み取り")
log_print('start : ' + model_filename)
start_time = datetime.datetime.now()
log_print("start time")
log_print(start_time)

# Register the network graph with the summary writer, using one encoded
# board as the example input.
dummy_input = chg_input_cnn(ban, 0)
model = NeuralNet_cnn(BANHEN, BANSIZE)
writer_x.add_graph(model, (dummy_input,), verbose=True)

if __name__ == '__main__':
    # NOTE(review): the episode loop body appears to continue beyond this
    # excerpt; only the per-episode resets are visible here.
    while train_is_continue:
        for episode in range(NUM_EPISODES):  # repeat up to the episode limit
            episode_sum += 1
            ban.ban_reset()
            step = 0  # reset the move counter
            terminal = False  # reset the end-of-game flag
def _play_ai_vs_ai(ban, first_model, second_model):
    """Play one game between two models on ``ban`` and report the winner.

    ``first_model`` plays side 0 (moving first) and ``second_model`` plays
    side 1.  Each side's opening move is chosen uniformly at random; later
    moves come from ``decide_action_func``.

    Returns:
        ``0`` if side 0 won, ``1`` if side 1 won, ``None`` for a draw.
    """
    ban.ban_reset()
    side_models = (first_model, second_model)
    step = 0
    while True:
        step += 1
        for player_side, side_model in enumerate(side_models):
            state = chg_input_cnn(ban, player_side)
            if step <= 1:
                # Random opening move.
                action = random.choice(ban.ban_put_available())
            else:
                action, _ = decide_action_func(side_model, ban, state)
            ban.ban_applay(player_side, action[0], action[1])
            if ban.ban_win(player_side, action[0], action[1]):
                return player_side
            if ban.ban_fill():
                return None


def check_win_rate_ai(Env, brain, main_model, new_model, max_episode):
    """Compare ``new_model`` against ``main_model`` over paired games.

    Each episode consists of two games: one with ``main_model`` moving
    first and one with ``new_model`` moving first.

    Args:
        Env: Board class, instantiated as ``Env(BANHEN, WINREN)``.
        brain: Unused.
        main_model: Current reference model.
        new_model: Candidate model.
        max_episode: Number of episodes (two games each).

    Returns:
        Percentage of decisive (non-drawn) games won by ``new_model``.
        Returns ``0.0`` when there were no decisive games (all draws, or
        ``max_episode == 0``); the original raised ``ZeroDivisionError``
        in that case.
    """
    win_main = 0
    win_new = 0
    ban = Env(BANHEN, WINREN)

    for episode in range(max_episode):
        print("\rstep : {0}/{1} ".format(episode, max_episode), end="")

        # Game 1: main_model moves first.
        winner = _play_ai_vs_ai(ban, main_model, new_model)
        if winner == 0:
            win_main += 1
        elif winner == 1:
            win_new += 1

        # Game 2: new_model moves first.
        winner = _play_ai_vs_ai(ban, new_model, main_model)
        if winner == 0:
            win_new += 1
        elif winner == 1:
            win_main += 1

    decisive_games = win_main + win_new
    if decisive_games == 0:
        return 0.0
    win_rate = 100 * win_new / decisive_games
    return win_rate
def decide_action(self, ban, model, player_side, search_depth, step,
                  episode_sum, ep_random_data, fastmode=False):
    """Choose a move for ``player_side`` with epsilon-greedy exploration.

    The exploration threshold decays exponentially with ``episode_sum``
    (``EPS_START`` -> ``EPS_END`` with scale ``EPS_DECAY``) and is stored
    on ``self.eps_threshold``.  Above the threshold the move comes from
    ``self.rtn_reward``; otherwise a legal move is drawn at random.
    ``ban`` itself is never modified; moves are tried on a deep copy.

    Args:
        ban: Current board.
        model: Model forwarded to ``self.rtn_reward``.
        player_side: Side to move.
        search_depth: Unused.
        step: Unused.
        episode_sum: Cumulative episode count driving the epsilon decay.
        ep_random_data: Unused.
        fastmode: Debug switch; picks a random move without the network.

    Returns:
        ``(reward, r, c, state, terminal)`` where ``state`` is the encoded
        board *before* the move, ``reward`` is 1 for a winning move and 0
        otherwise, and ``terminal`` tells whether the game ended.
        With ``fastmode=True`` the tuple is
        ``(0, r, c, None, None, terminal)`` and ``terminal`` only reflects
        a full board.
    """
    if fastmode:
        # Debug path: no network involved.
        # NOTE(review): this path returns six values while every other
        # path returns five; left unchanged because callers are not
        # visible here.
        action = random.choice(ban.ban_put_available())
        r, c = action[0], action[1]
        ban_copy = copy.deepcopy(ban)
        ban_copy.ban_applay(player_side, r, c)
        terminal = bool(ban_copy.ban_fill())
        return 0, r, c, None, None, terminal

    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(
        -1. * episode_sum / EPS_DECAY)
    self.eps_threshold = eps_threshold

    if sample > eps_threshold:
        # Exploit: let the model-guided search pick the move.
        reward, r, c, state, terminal = self.rtn_reward(
            ban, model, player_side)
        return reward, r, c, state, terminal

    # Explore: uniformly random legal move.
    action = random.choice(ban.ban_put_available())
    r, c = action[0], action[1]
    ban_copy = copy.deepcopy(ban)
    state = chg_input_cnn(ban_copy, player_side)  # encoded before the move
    ban_copy.ban_applay(player_side, r, c)

    # The win must be checked before the "board full" test: a move that
    # both completes a line and fills the last cell is a win, not a draw.
    # (The original tested ban_fill() first, unlike rtn_reward/rtn_q.)
    if ban_copy.ban_win(player_side, r, c):
        return 1, r, c, state, True
    if ban_copy.ban_fill():
        return 0, r, c, state, True
    return 0, r, c, state, False
def rtn_reward(self, ban, model, player_side):
    """Pick a move by refining the policy output with sampled look-aheads.

    The model's policy output for the current position is taken as the
    initial per-cell score.  Then, ``random_search_value`` times, a legal
    move is drawn uniformly at random, evaluated with ``self.rtn_q``, and
    its score is blended toward that estimate.  The highest-scoring legal
    move is finally played on a copy of ``ban`` to determine the outcome;
    ``ban`` itself is not modified.

    Args:
        ban: Current board.
        model: Callable returning ``(policy, value)`` for an input state.
        player_side: Side to move.

    Returns:
        ``(reward, r, c, state, terminal)``: ``state`` is the encoded board
        before the move, ``reward`` is 1 if the move wins and 0 otherwise,
        and ``terminal`` is True on a win or a full board.
    """
    q_blend = 0.99  # weight of the look-ahead estimate over the prior score

    ban_copy = copy.deepcopy(ban)
    state = chg_input_cnn(ban_copy, player_side)
    p_ary, _ = model(state.to(self.device))
    p_ary = p_ary.detach().cpu().numpy()[0]
    ban_put_available = ban_copy.ban_put_available()

    # The original also built softmax weights over the legal moves on every
    # iteration, but the weighted draw was commented out and the weights
    # were never used; that dead computation has been removed.  Candidates
    # are drawn uniformly, exactly as before.
    for _ in range(random_search_value):
        action = random.choices(ban_put_available)[0]
        q = self.rtn_q(ban, model, player_side, action)
        index = rc2index(action[0], action[1])
        p_ary[index] = q * q_blend + p_ary[index] * (1 - q_blend)

    # Best-scoring cell that is actually playable.
    for index in np.argsort(p_ary)[::-1]:
        r, c = index2rc(index)
        if [r, c] in ban_put_available:
            break

    ban_copy.ban_applay(player_side, r, c)
    if ban_copy.ban_win(player_side, r, c):
        return 1, r, c, state, True
    if ban_copy.ban_fill():
        # Board is full: no further move is possible.
        return 0, r, c, state, True
    return 0, r, c, state, False