def play(model):
    """Play one self-play game and collect training data.

    Args:
        model: policy-value network handed to ``pv_mcts_scores``.

    Returns:
        A list of ``[state_representation, policy_vector, value]`` triples,
        one per move played, where the value is the game outcome from the
        perspective of the player to move at that position.
    """
    # Training data accumulated over the game.
    history = []

    # Start from the initial game state.
    state = State()
    while True:
        # Stop once the game has reached a terminal position.
        if state.is_done():
            break

        # Probability distribution over the legal moves from MCTS.
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # Record the state and a full-size policy vector; the value slot
        # stays None until the game outcome is known.
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([state.pieces_array(), policies, None])

        # Sample the next action from the MCTS distribution.
        action = np.random.choice(state.legal_actions(), p=scores)

        # Advance to the resulting state.
        state = state.next(action)

    # Back-fill the outcome: the sign flips each ply because the players
    # alternate, so each entry holds the value for its player to move.
    # Direct iteration replaces the un-Pythonic range(len(...)) loop.
    value = first_player_value(state)
    for entry in history:
        entry[2] = value
        value = -value
    return history
def play(model):
    """Run a single self-play game and return its training records.

    Each record is ``[pieces_array, policy_vector, value]``; the value is
    filled in after the game ends, alternating sign ply by ply.
    """
    records = []
    state = State()

    # Generate moves until the game reaches a terminal state.
    while not state.is_done():
        # MCTS-derived probability distribution over the legal moves.
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # Expand the sparse per-move scores into a full policy vector.
        policies = [0] * DN_OUTPUT_SIZE
        for move, prob in zip(state.legal_actions(), scores):
            policies[move] = prob
        records.append([state.pieces_array(), policies, None])

        # Sample an action according to the distribution and step forward.
        chosen = np.random.choice(state.legal_actions(), p=scores)
        state = state.next(chosen)

    # Assign the final outcome; even plies get it as-is, odd plies negated,
    # which matches flipping the sign once per recorded move.
    outcome = first_player_value(state)
    for idx, record in enumerate(records):
        record[2] = outcome if idx % 2 == 0 else -outcome
    return records
def play(model):
    """Execute one self-play game and gather its training examples.

    Args:
        model: policy-value network used by ``pv_mcts_scores``.

    Returns:
        List of ``[state_representation, policy_vector, value]`` entries,
        one per move, with the value expressed from the viewpoint of the
        player to move at that entry.
    """
    # Training data collected during the game.
    history = []

    # Create the initial state.
    state = State()
    while True:
        # Terminal position: the game is over.
        if state.is_done():
            break

        # Probability distribution over legal moves (MCTS search result).
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)

        # Add the state and its policy to the training data; the value is
        # unknown until the game finishes, so leave it as None.
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(state.legal_actions(), scores):
            policies[action] = policy
        history.append([state.pieces_array(), policies, None])

        # Choose an action by sampling from the policy distribution.
        action = np.random.choice(state.legal_actions(), p=scores)

        # Move to the next state.
        state = state.next(action)

    # Add the game value to the training data, negating it every ply since
    # the side to move alternates. Iterating the list directly fixes the
    # un-Pythonic range(len(...)) index loop.
    value = first_player_value(state)
    for entry in history:
        entry[2] = value
        value = -value
    return history