Пример #1
0
    # Excerpt of a training loop: for each episode, reset the environment and,
    # on odd turns, let player1 act as BLACK and train on the resulting sample.
    # Running count of loop iterations across all episodes.
    global_step = 0
    # Created empty here; nothing in this excerpt appends to them.
    scores, episodes = [], []

    for e in range(EPISODES):
        # Per-episode state: termination flag and score, both reset each episode.
        done = False
        score = 0
        # Reset the environment and reshape the state to [1, state_size].
        state = env.reset()
        state = np.reshape(state, [1, player1.state_size])

        while not done:
            # Count this iteration of the step loop.
            global_step += 1

            # Odd turn (Q-learning player) - Black
            if env.get_turn() % 2 == 1:
                # Select an action for the current state
                action = player1.get_action(state)
                # Advance the environment one time step with the chosen action
                # and collect the sample
                next_state, reward, done = env.step(BLACK, action)
                # The bare triple-quoted string below holds disabled debug
                # prints; as a string expression it has no runtime effect.
                '''
				print("Action : ", action)
				print("Reward : ", reward)
				print("Next State : ", next_state)
				print()
				'''
                next_state = np.reshape(next_state, [1, player1.state_size])
                # The action chosen for next_state is passed to train_model
                # together with the transition.
                next_action = player1.get_action(next_state)
                # Train the model with the sample
                player1.train_model(state, action, reward, next_state,
                                    next_action, done)
Пример #2
0
                # Step the environment as BLACK with `action` (assigned before
                # this excerpt) and unpack (next_state, reward, done).
                next_state, reward, done = env.step(BLACK, action)
                # Reshape the new state to [1, state_size].
                next_state = np.reshape(next_state, [1, player.state_size])
                if PRINT_FLAG:
                    # Debug output: the action is also shown split into
                    # int(action / 10) and action % 10.
                    # NOTE(review): presumably a two-part board coordinate
                    # encoded as a * 10 + b - confirm against the environment.
                    print("Action : {0} ==> {1}, {2}".format(
                        action, int(action / 10), action % 10))
                    print("Reward : ", reward)
                    print("Next State : ", next_state)
                    print()
                # Hand the (state, action, reward) sample to the player.
                player.append_sample(state, action, reward)
                score += reward
                # Deep copy so that `state` does not alias `next_state`.
                state = copy.deepcopy(next_state)

                if PRINT_FLAG:
                    # Print the board
                    print("Episode : {0}, Turn : {1}, PLAYER1".format(
                        e, env.get_turn()))
                    env.draw_board()
                    print()
                    # Pause for one second after each printed turn.
                    time.sleep(1)
            # White
            else:
                # Fetch the current state
                state = env.get_state()
                # Reshape the state to [1, state_size].
                state = np.reshape(state, [1, player.state_size])
                # Select an action for the current state
                action = player.get_action(state)
                # Advance the environment one time step with the chosen action
                # and collect the sample
                # NOTE(review): this branch is labelled "White", yet the step
                # is taken as BLACK - confirm whether that is intended.
                next_state, reward, done = env.step(BLACK, action)
                # Reshape the new state to [1, state_size].
                next_state = np.reshape(next_state, [1, player.state_size])
                if PRINT_FLAG:
                    print("Action : {0} ==> {1}, {2}".format(