def self_play(tracker_queue, net, replay_queue, probs_queue, loop_count, device="cpu"):
    """Play one period of self-play games and report the playing speed.

    ``SELF_PLAY_PERIOD`` games are played through ``model.play_game`` with
    ``net`` on both sides. After each game a
    ``("speed_steps", steps_per_second, episode_index)`` tuple is put on
    ``tracker_queue`` and a one-line summary is printed.

    :param tracker_queue: object with ``put``; receives the speed records
    :param net: network passed to ``model.play_game`` for both players
    :param replay_queue: forwarded unchanged to ``model.play_game``
    :param probs_queue: forwarded unchanged to ``model.play_game``
    :param loop_count: index of this period; episode numbering starts at
        ``loop_count * SELF_PLAY_PERIOD``
    :param device: forwarded to ``model.play_game``
    """
    stores = [mcts.MCTS(), mcts.MCTS()]
    first_episode = loop_count * SELF_PLAY_PERIOD

    for offset in range(SELF_PLAY_PERIOD):
        episode = first_episode + offset
        started = time.perf_counter()
        status = ""
        _, steps = model.play_game(
            stores, replay_queue, probs_queue, net, net,
            steps_before_tau_0=STEPS_BEFORE_TAU_0,
            mcts_searches=MCTS_SEARCHES,
            mcts_batch_size=MCTS_BATCH_SIZE,
            device=device, status=status)
        elapsed = time.perf_counter() - started
        speed_steps = steps / elapsed

        # This text is rebuilt after every game but reset to "" before the
        # next call, so play_game always receives an empty status.
        status = "episode #{}, steps {:3d}, processing time {:5.2f} [s], steps/s {:5.2f}".format(
            episode, steps, elapsed, speed_steps)

        tracker_queue.put(("speed_steps", speed_steps, episode))
        print("episode #%d, steps %3d, steps/s %5.2f" % (episode, steps, speed_steps))
def evaluate(net1, net2, rounds, device="cpu"):
    """Play ``rounds`` games between two networks and return net1's win ratio.

    A game counts as a net1 win when ``model.play_game`` returns a value
    above 0.5 and as a net2 win when it returns a value below -0.5; anything
    in between is ignored.

    :param net1: first network
    :param net2: second network
    :param rounds: number of games to play
    :param device: forwarded to ``model.play_game``
    :return: ``n1_win / (n1_win + n2_win)``; 0.5 when no game was decided
        (``rounds == 0`` or every game fell in the ignored band)
    """
    n1_win, n2_win = 0, 0
    mcts_stores = [mcts.MCTS(), mcts.MCTS()]

    for _round in range(rounds):
        r, _ = model.play_game(mcts_stores=mcts_stores, replay_buffer=None,
                               net1=net1, net2=net2,
                               steps_before_tau_0=0, mcts_searches=20,
                               mcts_batch_size=16, device=device)
        if r < -0.5:
            n2_win += 1
        elif r > 0.5:
            n1_win += 1

    decided = n1_win + n2_win
    if decided == 0:
        # Without a decided game the ratio is undefined; report parity
        # instead of raising ZeroDivisionError.
        return 0.5
    return n1_win / decided
def eval(val, lock, net1, net2, device, cpuf):
    """Evaluation worker: keep playing net1 against net2 and tally results.

    The side each network plays is chosen at random per game. Results are
    accumulated in the shared ``val`` container under ``lock``:
    ``val[1]`` is incremented when net1 wins, ``val[2]`` otherwise.
    The loop ends once ``val[0] <= 0`` is observed after a game.

    Note: the name shadows the ``eval`` builtin; it is kept because callers
    refer to it.

    :param val: shared indexable; ``val[0]`` is the keep-running value and
        ``val[1]`` / ``val[2]`` are the counters. It is also passed to
        ``model.play_game`` as its first argument.
    :param lock: context-manager lock guarding ``val`` and the progress output
    :param net1: network whose wins are counted in ``val[1]``
    :param net2: opponent network
    :param device: device for the games (and for the networks when ``cpuf``)
    :param cpuf: when truthy, ``.to(device)`` is called on both networks first
    """
    if cpuf:
        net1.to(device)
        net2.to(device)

    mcts_stores = [mcts.MCTS(), mcts.MCTS()]
    while True:
        # are == 0: net1 is passed as play_game's net1; are == 1: swapped.
        are = random.randrange(0, 2)
        swapped = are >= 1
        r, _ = model.play_game(val, mcts_stores, None,
                               net1=net2 if swapped else net1,
                               net2=net1 if swapped else net2,
                               steps_before_tau_0=20, mcts_searches=40,
                               mcts_batch_size=40, best_idx=-1, device=device)

        # `with` guarantees the lock is released even if the bookkeeping or
        # printing below raises.
        with lock:
            if r is not None:
                net1_won = (r > 0.5 and not swapped) or (r < -0.5 and swapped)
                val[1 if net1_won else 2] += 1
                print("%d:%d %d/%d" % (are, r, val[1], val[2]), end=' ', flush=True)
                if (val[1] + val[2]) % 5 < 1:
                    print()
            stop = val[0] <= 0

        if stop:
            break
def __init__(self, model_file, player_moves_first, player_id):
    """Load a network from ``model_file`` and set up an empty game session.

    :param model_file: path given to ``torch.load``; also stored as-is
    :param player_moves_first: stored on the instance unchanged
    :param player_id: stored on the instance unchanged
    """
    self.model_file = model_file

    # Build the network, then restore its weights. The map_location callable
    # returns each storage unchanged rather than moving it to another device.
    network = model.Net(input_shape=model.OBS_SHAPE, actions_n=game.GAME_COLS)
    self.model = network
    weights = torch.load(model_file, map_location=lambda storage, loc: storage)
    network.load_state_dict(weights)

    # Fresh session state.
    self.state = game.INITIAL_STATE
    self.value = None
    self.player_moves_first = player_moves_first
    self.player_id = player_id
    self.moves = []
    self.mcts_store = mcts.MCTS()
def evaluate(net1, net2, rounds, device="cpu"):
    """Play ``rounds`` games between two networks, swapping sides halfway.

    For the first ``rounds // 2`` games net1 is passed to
    ``model.play_game`` as the first network; afterwards net2 is. The game
    index, result and step count are printed after every game.

    :param net1: network whose wins are counted
    :param net2: opponent network
    :param rounds: number of games to play
    :param device: forwarded to ``model.play_game``
    :return: net1 wins divided by the number of games with a non-zero
        result, or 0.5 when there was no such game
    """
    half = rounds // 2
    net1_wins = 0
    decided_games = 0
    stores = [mcts.MCTS(), mcts.MCTS()]

    for r_idx in range(rounds):
        net1_is_first = r_idx < half
        if net1_is_first:
            first, second = net1, net2
        else:
            first, second = net2, net1

        r, step = model.play_game(None, stores, None, first, second,
                                  steps_before_tau_0=game.MAX_TURN,
                                  mcts_searches=40, mcts_batch_size=40,
                                  best_idx=-1, device=device)

        # r is reported for the first network, so flip it when net1 was second.
        if (net1_is_first and r > 0) or (not net1_is_first and r < 0):
            net1_wins += 1
        if r != 0:
            decided_games += 1
        print(r_idx, r, step)

    if decided_games > 0:
        return net1_wins / decided_games
    return 0.5
# Training script fragment: `parser` is created before this point and the
# body of the `while True` loop continues after it.
parser.add_argument("--cuda", default=False, action="store_true", help="Enable CUDA")
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")

# Per-run output locations, both keyed by the run name given on the command line.
saves_path = os.path.join("saves", args.name)
os.makedirs(saves_path, exist_ok=True)
writer = SummaryWriter(comment="-" + args.name)

# `net` is the network being optimized; self-play below uses
# `best_net.target_model` instead.
# NOTE(review): presumably TargetNet keeps a separate copy of `net` - confirm against ptan.
net = model.Net(input_shape=model.OBS_SHAPE, actions_n=game.GAME_COLS).to(device)
best_net = ptan.agent.TargetNet(net)
print(net)

optimizer = optim.SGD(net.parameters(), lr=LEARNING_RATE, momentum=0.9)

# Bounded buffer: once REPLAY_BUFFER entries are held, the oldest are dropped.
replay_buffer = collections.deque(maxlen=REPLAY_BUFFER)
# A single MCTS store is shared by every self-play game in this script.
mcts_store = mcts.MCTS()
step_idx = 0
best_idx = 0

with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
    while True:
        # Self-play phase: time PLAY_EPISODES games and measure how much the
        # MCTS store grew while playing them.
        t = time.time()
        prev_nodes = len(mcts_store)
        game_steps = 0
        for _ in range(PLAY_EPISODES):
            _, steps = model.play_game(mcts_store, replay_buffer, best_net.target_model, best_net.target_model,
                                       steps_before_tau_0=STEPS_BEFORE_TAU_0, mcts_searches=MCTS_SEARCHES,
                                       mcts_batch_size=MCTS_BATCH_SIZE, device=device)
            game_steps += steps
        game_nodes = len(mcts_store) - prev_nodes
        dt = time.time() - t
def play_game(mcts_stores, replay_buffer, net1, net2, steps_before_tau_0, mcts_searches, mcts_batch_size,
              net1_plays_first=None, cuda=False):
    """
    Play a single game between two networks, optionally recording it.

    :param mcts_stores: None (two fresh stores are created), one MCTS shared
        by both players, or a list with one MCTS per player
    :param replay_buffer: deque receiving (state, player, probs, value)
        tuples, or None to record nothing
    :param net1: player with index 0
    :param net2: player with index 1
    :param steps_before_tau_0: number of steps played with tau=1 before
        switching to tau=0
    :param mcts_searches: passed to ``search_batch`` on every move
    :param mcts_batch_size: passed to ``search_batch`` on every move
    :param net1_plays_first: True/False to fix who starts, None to pick at random
    :param cuda: forwarded to ``search_batch``
    :return: (result for net1: +1 win, -1 loss, 0 draw; number of steps counted)
    """
    assert isinstance(replay_buffer, (collections.deque, type(None)))
    assert isinstance(mcts_stores, (mcts.MCTS, type(None), list))
    assert isinstance(net1, Net)
    assert isinstance(net2, Net)
    assert isinstance(steps_before_tau_0, int) and steps_before_tau_0 >= 0
    assert isinstance(mcts_searches, int) and mcts_searches > 0
    assert isinstance(mcts_batch_size, int) and mcts_batch_size > 0

    # Normalize the stores to one entry per player.
    if isinstance(mcts_stores, mcts.MCTS):
        mcts_stores = [mcts_stores, mcts_stores]
    elif mcts_stores is None:
        mcts_stores = [mcts.MCTS(), mcts.MCTS()]

    players = [net1, net2]
    state = game.INITIAL_STATE

    if net1_plays_first is None:
        cur_player = np.random.choice(2)
    elif net1_plays_first:
        cur_player = 0
    else:
        cur_player = 1

    history = []
    step = 0
    if steps_before_tau_0 > 0:
        tau = 1
    else:
        tau = 0

    while True:
        store = mcts_stores[cur_player]
        store.search_batch(mcts_searches, mcts_batch_size, state,
                           cur_player, players[cur_player], cuda=cuda)
        probs, _ = store.get_policy_value(state, tau=tau)
        history.append((state, cur_player, probs))

        action = np.random.choice(game.GAME_COLS, p=probs)
        if action not in game.possible_moves(state):
            print("Impossible action selected")
        state, won = game.move(state, action, cur_player)

        if won:
            # The player who just moved has won.
            result = 1
            net1_result = 1 if cur_player == 0 else -1
            break

        cur_player = 1 - cur_player

        # No moves left for the next player: the game is a draw.
        if len(game.possible_moves(state)) == 0:
            result = 0
            net1_result = 0
            break

        step += 1
        if step >= steps_before_tau_0:
            tau = 0

    if replay_buffer is not None:
        # Walk the game backwards from the final move; the value alternates
        # in sign with every step back.
        value = result
        for past_state, mover, past_probs in reversed(history):
            replay_buffer.append((past_state, mover, past_probs, value))
            value = -value

    return net1_result, step
def play_game(net1, steps_before_tau_0, mcts_batch_size, device="cpu"):
    """
    Run one interactive console game between a human and ``net1``.

    The human picks a side, then the game alternates between reading the
    human's input and letting MCTS guided by ``net1`` choose a move. Typing
    ``level N`` at a prompt sets the module-level ``mcts_searches`` to
    ``LEVELC * N``; ``new`` abandons the game; ``undo`` (after step 3) takes
    back the last two plies.

    :param net1: network used for the computer's moves
    :param steps_before_tau_0: while ``step`` is below this value the
        computer samples its move from the policy; afterwards it takes the argmax
    :param mcts_batch_size: passed to ``search_batch``
    :param device: passed to ``search_batch``
    :return: None; the loop ends by ``break`` when a side wins or on ``new``
    """
    assert isinstance(net1, model.Net)
    assert isinstance(steps_before_tau_0, int) and steps_before_tau_0 >= 0
    assert isinstance(mcts_batch_size, int) and mcts_batch_size > 0
    # Rebound below by the "level" command, hence the global declaration.
    global mcts_searches

    pan = game.encode_lists([list(i) for i in game.INITIAL_STATE], 0)
    # Encoded position after every move, used for repetition checks and undo.
    historystr = []
    cur_player = 0
    step = 0
    mctsi = mcts.MCTS()

    # `result` is never reassigned; `exitf` is set by the "new" command.
    result = None; exitf = False
    # Code point of '1', used to convert between digit characters and indices.
    a0 = ord('1')

    # Side selection (prompt: "choose the side to play, 0) Cho, 1) Han").
    while True:
        s=input('플레이하려는 진영을 선택하세요 0) 초, 1)한 ?')
        if s.find('level') >= 0:
            mcts_searches = LEVELC * int(s[6:])
            print('OK', flush=True)
        else:
            player_human = 0 if int(s)<1 else 1
            break

    while result is None:
        movelist = game.possible_moves(pan, cur_player, step)
        # Repetition guard: when the positions four and eight plies ago match
        # (first 90 characters), try every candidate move on a decoded board
        # and drop the first one that would recreate the position from four
        # plies ago.
        if step>9 and historystr[-4][:90]==historystr[-8][:90]:
            p = game.decode_binary(pan)
            for idx, m in enumerate(movelist):
                # A move is source*100 + target, each a cell index row*9 + column.
                spos = m // 100; tpos = m % 100; y0 = spos // 9; x0 = spos % 9; y1 = tpos // 9; x1 = tpos % 9
                captured = p[y1][x1]; p[y1][x1] = p[y0][x0]; p[y0][x0] = 0
                ps = game.encode_lists(p, step+1)
                if ps[:90]==historystr[-4][:90]:
                    # Deleting while enumerating is safe only because of the immediate break.
                    del movelist[idx]; break
                # Undo the trial move before testing the next candidate.
                p[y0][x0] = p[y1][x1]; p[y1][x1] = captured

        # The human acts on steps 0-1 when it is NOT cur_player, and from
        # step 2 on when it IS cur_player; otherwise the computer moves.
        if (step<2 and cur_player != player_human) or (step>1 and cur_player == player_human):
            if step < 2:
                # Prompt: "choose the horse/elephant arrangement", options 0-3 from `masang`.
                print("마상 차림을 선택하세요 0) "+masang[0]+", 1) "+masang[1]+", 2) "+masang[2]+", 3) "+masang[3])
            else:
                render(pan, player_human)
                if step==2 or step==3:
                    print("")
                    # Help text: enter source row, source column, target row,
                    # target column (e.g. 0010); a single character passes.
                    print("옮기고자 하는 기물의 세로 번호, 가로 번호, 목적지의 세로 번호, 가로 번호 ex) 0010 한수 쉬기: 0")
            # Keep prompting until a usable action (>= 0) has been entered.
            action = -1
            while action<0:
                s=input((str(step-1) if step>1 else '')+' ? ')
                if s=="new": exitf=True; break
                elif s.find('level')>=0: mcts_searches=LEVELC*int(s[6:]); print('OK', flush=True)
                elif step<2:
                    # Arrangement choices are encoded as 10000 + option index.
                    if len(s)==1 and s[0]>='0' and s[0]<'4': action = int(s) + 10000
                # Any other single character is treated as action 0.
                elif len(s)==1: action = 0
                elif s=='undo' and step>3:
                    # Take back two plies and restore the position before them.
                    step-=2; historystr.pop(); historystr.pop(); pan=historystr[-1]
                    movelist = game.possible_moves(pan, cur_player, step)
                    render(pan, player_human)
                elif len(s)==4 and s[0]>='0' and s[0]<='9' and s[1]>'0' and s[1]<='9' and s[2]>='0' and s[2]<='9' and s[3]>'0' and s[3]<='9':
                    # Convert the typed digits to board indices: row digit
                    # '1'..'9' maps to 9..1 and '0' to 0, column digit
                    # '1'..'9' maps to 0..8; both are mirrored when the
                    # human plays side 0.
                    b1=9-ord(s[0])+a0 if s[0]>'0' else 0
                    if player_human<1: b1=9-b1
                    b2 = ord(s[1]) - a0
                    if player_human < 1: b2 = 8 - b2
                    b3 = 9-ord(s[2]) + a0 if s[2]>'0' else 0
                    if player_human < 1: b3 = 9 - b3
                    b4 = ord(s[3]) - a0
                    if player_human < 1: b4 = 8 - b4
                    action = (b1*9 + b2)*100 + b3*9+b4
                    # Reject moves that are not legal in this position.
                    # NOTE(review): this `else` is taken to belong to the legality
                    # check directly above - confirm the intended nesting.
                    if action not in movelist: action = -1
                    else: print('OK', flush=True)
        else:
            # Computer's turn: search, then sample from the policy early in
            # the game and pick the most probable move later.
            mctsi.search_batch(mcts_searches, mcts_batch_size, pan, cur_player, net1, step, device=device)
            probs, values = mctsi.get_policy_value(pan, movelist, cur_player)
            # Each side has its own table mapping policy index -> action.
            chList = actionTable.choList if cur_player < 1 else actionTable.hanList
            n = np.random.choice(actionTable.AllMoveLength, p=probs) if step<steps_before_tau_0 else np.argmax(probs)
            action = chList[n]
            """for m in movelist: print('%04d %.2f' % (m, probs[chList.index(m)]), end=', ') print()"""
            if step<2:
                # Announce the chosen arrangement together with its value estimate.
                print(('한: ' if step<1 else '초: ')+masang[action-10000]+' '+str(values[n]), flush=True)
                if step==1: render(pan, player_human)
            else:
                # Action 0 is announced as a pass.
                if action<1: print('한수쉼'+' '+str(values[n]))
                else:
                    # Convert the action back to the digit notation used for
                    # human input, mirrored the same way.
                    b1=action//100//9
                    if player_human<1: b1=9-b1
                    b2 = action//100%9
                    if player_human < 1: b2 = 8 - b2
                    b3 = action%100//9
                    if player_human < 1: b3 = 9 - b3
                    b4 = action%100%9
                    if player_human < 1: b4 = 8 - b4
                    print((chr(9-b1+a0) if b1>0 else '0')+chr(b2+a0)+(chr(9-b3+a0) if b3>0 else '0')+chr(b4+a0)+' '+str(values[n]))

        # "new" was typed: abandon the game without applying a move.
        if exitf: break
        pan, won = game.move(pan, action, step)
        historystr.append(pan)
        if won>0:
            render(pan, player_human)
            # won == 1 is announced as a Cho win, any other positive value as a Han win.
            print(('초' if won==1 else '한')+' 승')
            break
        cur_player = 1-cur_player
        step += 1
def play_game(value, mcts_stores, queue, net1, net2, steps_before_tau_0, mcts_searches, mcts_batch_size,
              best_idx, url=None, username=None, device="cpu"):
    """
    Play one game between two networks and optionally store or upload it.

    :param value: None, or an indexable whose element 0 is a keep-running
        value; the game is abandoned as soon as ``value[0] <= 0``
    :param mcts_stores: None (two fresh stores), one MCTS shared by both
        players, or a list of two; stores passed in are cleared first
    :param queue: None, a ``collections.deque`` (filled with ``append``) or
        any object with ``put``; receives (state, step, probs, result) tuples
    :param net1: player with index 0, moves first
    :param net2: player with index 1
    :param steps_before_tau_0: number of steps played with tau=1
    :param mcts_searches: passed to ``search_batch`` on every move
    :param mcts_batch_size: passed to ``search_batch`` on every move
    :param best_idx: when ``queue`` is None and this is >= 0, the finished
        game is sent with ``webFunction.http_request`` as "netIdx"
    :param url: upload target for ``webFunction.http_request``
    :param username: sent along with an uploaded game
    :param device: passed to ``search_batch``
    :return: (net1_result, step). net1_result is 1 when ``game.move``
        reports ``won == 1``, -1 for any other positive ``won``, and None
        when the game was abandoned, in which case step is reported as 0.
    """
    assert isinstance(mcts_stores, (mcts.MCTS, type(None), list))
    assert isinstance(net1, Net)
    assert isinstance(net2, Net)
    assert isinstance(steps_before_tau_0, int) and steps_before_tau_0 >= 0
    assert isinstance(mcts_searches, int) and mcts_searches > 0
    assert isinstance(mcts_batch_size, int) and mcts_batch_size > 0

    # Normalize to one store per player; stores handed in start out empty.
    if mcts_stores is None:
        mcts_stores = [mcts.MCTS(), mcts.MCTS()]
    elif isinstance(mcts_stores, mcts.MCTS):
        mcts_stores.clear()
        mcts_stores = [mcts_stores, mcts_stores]
    else:
        mcts_stores[0].clear()
        mcts_stores[1].clear()

    state = game.encode_lists([list(i) for i in game.INITIAL_STATE], 0)
    nets = [net1, net2]
    cur_player = 0
    step = 0
    tau = 1 if steps_before_tau_0 > 0 else 0
    game_history = []
    net1_result = None
    result = None

    # Identity test on purpose: `value == None` is element-wise for
    # array-like containers.
    while net1_result is None and (value is None or value[0] > 0):
        mcts_stores[cur_player].search_batch(mcts_searches, mcts_batch_size, state,
                                             cur_player, nets[cur_player], step, device=device)
        movel = game.possible_moves(state, cur_player, step)
        probs, _ = mcts_stores[cur_player].get_policy_value(state, movel, cur_player, tau=tau)
        # Each side has its own table mapping policy index -> action.
        chList = actionTable.choList if cur_player < 1 else actionTable.hanList
        action = chList[np.random.choice(actionTable.AllMoveLength, p=probs)]
        # The history format depends on the consumer: training queues get
        # positions, the upload path gets the actions played.
        game_history.append((action, probs) if queue is None else (state, step, probs))
        if action not in movel:
            print("Impossible action selected")
        state, won = game.move(state, action, step)
        if step % 3 < 1:
            print('.', end='', flush=True)
        if won > 0:
            net1_result = 1 if won == 1 else -1
            result = -net1_result
            break
        step += 1
        cur_player = 1 - cur_player
        if step >= steps_before_tau_0:
            tau = 0

    if net1_result is not None:
        print()
        if queue is not None:
            push = queue.append if isinstance(queue, collections.deque) else queue.put
            for past_state, past_step, past_probs in game_history:
                push((past_state, past_step, past_probs, result))
                # The sign alternates after every entry except the one
                # recorded at step 1.
                if past_step != 1:
                    result = -result
        elif best_idx >= 0:
            # Upload only the non-zero policy entries of every move.
            gh = []
            for played, move_probs in game_history:
                prar = [[idx, prob] for idx, prob in enumerate(move_probs) if prob > 0]
                gh.append((played, prar))
            js = {"netIdx": best_idx, "result": net1_result, "username": username, "action": gh}
            hr = webFunction.http_request(url, True, json.dumps(js))
            if hr is None:
                sys.exit()
            elif hr['status'] == 'error':
                print('error occured')
            else:
                print("game is uploaded")

    return net1_result, (step if net1_result is not None else 0)
def play_game1(net1, net2, steps_before_tau_0, mcts_searches, mcts_batch_size, device="cpu"):
    """
    Play one single game between two networks with per-player search settings.

    Nothing is stored in a replay buffer; besides the result, the function
    reports how large each player's MCTS store was after every search.

    :param net1: player with index 0
    :param net2: player with index 1
    :param steps_before_tau_0: number of steps played with tau=1
    :param mcts_searches: indexable by player; search count per move
    :param mcts_batch_size: indexable by player; search batch size per move
    :param device: passed to ``search_batch``
    :return: (net1_result, step, n_trees). net1_result is the value of
        ``game_c.calc_result_f`` seen from player 0; ``n_trees[p]`` lists
        ``len()`` of player p's MCTS store after each of its searches.
    """
    assert isinstance(net1, Net)
    assert isinstance(net2, Net)
    assert isinstance(steps_before_tau_0, int) and steps_before_tau_0 >= 0

    mcts_stores = [mcts.MCTS(), mcts.MCTS()]
    nets = [net1, net2]
    # The action one past the last board cell is counted as a pass.
    pass_action = game.BOARD_SIZE ** 2

    state = game_c.INITIAL_STATE
    cur_player = game.PLAYER_BLACK
    step = 0
    tau = 1 if steps_before_tau_0 > 0 else 0
    result = None
    net1_result = None
    n_trees = [[], []]
    pass_count = 0

    while result is None:
        store = mcts_stores[cur_player]
        store.search_batch(mcts_searches[cur_player], mcts_batch_size[cur_player],
                           state, cur_player, nets[cur_player], device=device)
        n_trees[cur_player].append(len(store))
        probs, _ = store.get_policy_value(state, tau=tau)
        action = np.random.choice(pass_action + 1, p=probs)
        del probs

        if not game_c.is_possible_move(state, action):
            # in this case, game.move function raise AssertionError
            print("Impossible action selected")
        state, field = game_c.move(state, action)

        # Let both stores discard what is no longer needed for the new position.
        mcts_stores[0].clear_subtrees(state)
        mcts_stores[1].clear_subtrees(state)

        if action == pass_action:
            pass_count += 1
        else:
            pass_count = 0

        # The game ends after two consecutive passes or when no cell of
        # `field` equals 2.
        # NOTE(review): 2 presumably marks an empty cell - confirm in game_c.
        if pass_count == 2 or (field != 2).all():
            result = game_c.calc_result_f(field, cur_player)
            net1_result = result if cur_player == 0 else -result

        cur_player = 1 - cur_player
        step += 1
        if step >= steps_before_tau_0:
            tau = 0

    # Empty the search stores explicitly before they go out of scope.
    mcts_stores[0].clear()
    mcts_stores[1].clear()
    return net1_result, step, n_trees
def play_game(mcts_stores, replay_queue, probs_queue, net1, net2, steps_before_tau_0,
              mcts_searches, mcts_batch_size, device="cpu", status=""):
    """
    Play one single game, putting augmented transitions on the output queues.

    :param mcts_stores: None (two fresh stores), one MCTS shared by both
        players, or a list of two; the stores are cleared when the game ends
    :param replay_queue: object with ``put`` receiving
        (state, player, value) tuples; if None, nothing is stored
    :param probs_queue: object with ``put`` receiving the matching move
        probabilities, one per ``replay_queue`` entry; only used when
        ``replay_queue`` is not None
    :param net1: player with index 0
    :param net2: player with index 1
    :param steps_before_tau_0: number of steps played with tau=1
    :param mcts_searches: passed to ``search_batch`` on every move
    :param mcts_batch_size: passed to ``search_batch`` on every move
    :param device: passed to ``search_batch``
    :param status: accepted for caller compatibility; not used
    :return: (net1_result, step), where net1_result is the value of
        ``game_c.calc_result_f`` seen from player 0
    """
    assert isinstance(mcts_stores, (mcts.MCTS, type(None), list))
    assert isinstance(net1, Net)
    assert isinstance(net2, Net)
    assert isinstance(steps_before_tau_0, int) and steps_before_tau_0 >= 0
    assert isinstance(mcts_searches, int) and mcts_searches > 0
    assert isinstance(mcts_batch_size, int) and mcts_batch_size > 0

    # Normalize to one store per player.
    if mcts_stores is None:
        mcts_stores = [mcts.MCTS(), mcts.MCTS()]
    elif isinstance(mcts_stores, mcts.MCTS):
        mcts_stores = [mcts_stores, mcts_stores]

    nets = [net1, net2]
    # The action one past the last board cell is counted as a pass.
    pass_action = game.BOARD_SIZE ** 2

    state = game_c.INITIAL_STATE
    cur_player = game.PLAYER_BLACK
    step = 0
    tau = 1 if steps_before_tau_0 > 0 else 0
    game_history = []
    result = None
    net1_result = None
    pass_count = 0

    while result is None:
        store = mcts_stores[cur_player]
        store.search_batch(mcts_searches, mcts_batch_size, state,
                           cur_player, nets[cur_player], device=device)
        probs, _ = store.get_policy_value(state, tau=tau)
        game_history.append((state, cur_player, probs))
        action = np.random.choice(pass_action + 1, p=probs)
        del probs

        if not game_c.is_possible_move(state, action):
            # in this case, game.move function raise AssertionError
            print("Impossible action selected")
        state, field = game_c.move(state, action)

        # Let both stores discard what is no longer needed for the new position.
        mcts_stores[0].clear_subtrees(state)
        mcts_stores[1].clear_subtrees(state)

        if action == pass_action:
            pass_count += 1
        else:
            pass_count = 0

        # The game ends after two consecutive passes, or when one of the
        # values 0, 1 or 2 no longer occurs anywhere in `field`.
        # NOTE(review): presumably 0/1 are the players' stones and 2 is an
        # empty cell - confirm in game_c.
        if (pass_count == 2 or (field != 2).all()
                or (field != 0).all() or (field != 1).all()):
            result = game_c.calc_result_f(field, cur_player)
            net1_result = result if cur_player == 0 else -result

        cur_player = 1 - cur_player
        step += 1
        if step >= steps_before_tau_0:
            tau = 0

    if replay_queue is not None:
        # Walk the game backwards from the final move. Every step back flips
        # the sign of the value and scales it by 0.95.
        for past_state, mover, past_probs in reversed(game_history):
            augment_data = game.augment_data(past_state, past_probs)
            for arg_state, arg_probs in zip(*augment_data):
                replay_queue.put((arg_state, mover, result))
                probs_queue.put(arg_probs)
            augment_data[0].clear()
            augment_data[1].clear()
            result = -0.95 * result

    # The stores may belong to the caller, so empty them for the next game.
    mcts_stores[0].clear()
    mcts_stores[1].clear()
    return net1_result, step