Example #1
def evaluate(policy_net):
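    """Run env.config.EVAL_EPISODE evaluation games against either a loaded
    opponent network or the hard-coded policy.

    Returns [mean reward, reward std, win count, loss count] for the learned player.
    """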
    total_rewards = []
    win_loss = []
    for e in range(env.config.EVAL_EPISODE):
        # Initialize the environment and state
        env.reset()
        input_stack.__init__(env)
        prev_hard_coded_a = 1  # players init to up
        print('Starting episode:', e)

        while True:
            # Select and perform an action
            action = test_select_action(policy_net, input_stack, env)

            if env.config.load_opponent is not None:
                hard_coded_a = test_select_action(player2_net, input_stack, env, is_opponent=True).item()
            else:
                hard_coded_a = hard_coded_policy(env.observation,
                                                 np.argwhere(env.head_board == 2)[0],
                                                 prev_hard_coded_a,
                                                 env.config.board_shape,
                                                 env.action_space,
                                                 eps=env.config.hcp_eps)
                prev_hard_coded_a = hard_coded_a

            next_observation, reward, done, dictionary = env.step([action.item(), hard_coded_a])

            env.render()

            input_stack.update(env)

            if done:
                # utils.show_board(next_observation, dictionary['head_board'], env.config.cmap, delay=env.config.delay, filename='tmp.png')
                player_reward = reward[0]
                win_loss.append(player_reward > 0)
                break
        total_rewards.append(player_reward)

    stats = [
        np.mean(total_rewards),
        np.std(total_rewards),
        np.sum(win_loss),
        len(win_loss) - np.sum(win_loss)
    ]

    return stats
Example #2
def evaluate(policy_net):
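    """Evaluate a joint policy that controls players 1 and 2 with a single
    composite action, decoded into two per-player moves below.

    Returns [mean reward, reward std, win count, loss count] for player 1.
    """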
    total_rewards = []
    win_loss = []
    for e in range(env.config.EVAL_EPISODE):  # probably put number of episodes in config
        # Initialize the environment and state
        env.reset()
        input_stack.__init__(env)
        prev_a_3 = 1  # players init to up
        prev_a_4 = 1  # players init to up
        print('Starting episode:', e)

        while True:
            # Select and perform an action
            action = test_select_action(policy_net, input_stack, env, 1, 2)
            a_1 = np.floor_divide(action.item(), env.action_space.n)
            a_2 = action.item() % env.action_space.n

            if env.config.load_opponent is not None:
                opponent_action = test_select_action(opponent_net, input_stack,
                                                     env, 3, 4)
                a_3 = np.floor_divide(opponent_action.item(),
                                      env.action_space.n)
                a_4 = opponent_action.item() % env.action_space.n
            else:
                a_3 = hard_coded_policy(env.observation,
                                        np.argwhere(env.head_board == 3)[0],
                                        prev_a_3,
                                        env.config.board_shape,
                                        env.action_space,
                                        eps=env.config.hcp_eps)
                a_4 = hard_coded_policy(env.observation,
                                        np.argwhere(env.head_board == 4)[0],
                                        prev_a_4,
                                        env.config.board_shape,
                                        env.action_space,
                                        eps=env.config.hcp_eps)
                prev_a_3 = a_3
                prev_a_4 = a_4

            next_observation, reward, done, dictionary = env.step(
                [a_1, a_2, a_3, a_4])

            if env.config.show:
                env.render()

            input_stack.update(env)

            if done:
                # utils.show_board(next_observation, dictionary['head_board'], env.config.cmap, delay=env.config.delay, filename='tmp.png')
                player_reward = reward[0]
                win_loss.append(player_reward > 0)
                break
        total_rewards.append(player_reward)

    stats = [
        np.mean(total_rewards),
        np.std(total_rewards),
        np.sum(win_loss),
        len(win_loss) - np.sum(win_loss)
    ]

    return stats
Example #3
        while True:
            # Select and perform an action
            action = test_select_action(policy_net, input_stack, env, 1, 2)
            a_1 = np.floor_divide(action.item(), env.action_space.n)
            a_2 = action.item() % env.action_space.n

            if env.config.load_opponent is not None:
                opponent_action = test_select_action(opponent_net, input_stack,
                                                     env, 3, 4)
                a_3 = np.floor_divide(opponent_action.item(),
                                      env.action_space.n)
                a_4 = opponent_action.item() % env.action_space.n
            else:
                a_3 = hard_coded_policy(env.observation,
                                        np.argwhere(env.head_board == 3)[0],
                                        prev_a_3,
                                        env.config.board_shape,
                                        env.action_space,
                                        eps=env.config.hcp_eps)
                a_4 = hard_coded_policy(env.observation,
                                        np.argwhere(env.head_board == 4)[0],
                                        prev_a_4,
                                        env.config.board_shape,
                                        env.action_space,
                                        eps=env.config.hcp_eps)
                prev_a_3 = a_3
                prev_a_4 = a_4

            next_observation, reward, done, dictionary = env.step(
                [a_1, a_2, a_3, a_4])

            reward = torch.tensor([reward], device=device)
Example #4
def evaluate(policy_net_1,
             policy_net_2,
             opponent_net1=None,
             opponent_net2=None):
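    """Evaluate two learned teammates (players 1 and 2) against either loaded
    opponent networks or the hard-coded policy, over env.config.EVAL_EPISODE games.

    Returns per-player and team mean/std rewards plus win counts.
    """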
    player_1_rewards = []
    player_2_rewards = []
    team_rewards = []
    player_1_win = []
    player_2_win = []
    team_win = []

    for e in range(env.config.EVAL_EPISODE):
        # Initialize the environment and state
        env.reset()
        input_stack.__init__(env)
        prev_hard_coded_a = 1  # players init to up
        prev_hard_coded_b = 1  # players init to up
        print('Starting episode:', e)
        while True:
            # Select and perform an action
            action_1 = test_select_action(policy_net_1,
                                          input_stack,
                                          env,
                                          player_num=1)
            action_2 = test_select_action(policy_net_2,
                                          input_stack,
                                          env,
                                          player_num=2)

            if env.config.load_opponent is not None:
                hard_coded_a = test_select_action(opponent_net1,
                                                  input_stack,
                                                  env,
                                                  3,
                                                  is_opponent=True).item()
                hard_coded_b = test_select_action(opponent_net2,
                                                  input_stack,
                                                  env,
                                                  4,
                                                  is_opponent=True).item()
            else:
                hard_coded_a = hard_coded_policy(
                    env.observation,
                    np.argwhere(env.head_board == 3)[0],
                    prev_hard_coded_a,
                    env.config.board_shape,
                    env.action_space,
                    eps=env.config.hcp_eps)
                hard_coded_b = hard_coded_policy(
                    env.observation,
                    np.argwhere(env.head_board == 4)[0],
                    prev_hard_coded_b,
                    env.config.board_shape,
                    env.action_space,
                    eps=env.config.hcp_eps)
                prev_hard_coded_a = hard_coded_a
                prev_hard_coded_b = hard_coded_b

            next_observation, reward, done, dictionary = env.step(
                [action_1.item(),
                 action_2.item(), hard_coded_a, hard_coded_b])

            input_stack.update(env)

            if done:
                player_1_rewards.append(reward[0])
                player_2_rewards.append(reward[1])
                team_rewards.append(reward[0] + reward[1])
                player_1_win.append(reward[0] > 0)
                player_2_win.append(reward[1] > 0)
                team_win.append((reward[0] > 0) or (reward[1] > 0))
                utils.show_board(next_observation,
                                 dictionary['head_board'],
                                 env.config.cmap,
                                 delay=env.config.delay,
                                 filename='tmp.png')
                break

            env.render()

    stats = [
        np.mean(player_1_rewards),
        np.std(player_1_rewards),
        np.mean(player_2_rewards),
        np.std(player_2_rewards),
        np.mean(team_rewards),
        np.std(team_rewards),
        np.sum(player_1_win),
        np.sum(player_2_win),
        np.sum(team_win)
    ]

    return stats
Example #5
def evaluate_hard():
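    """Baseline evaluation: all four players follow the hard-coded policy
    for 1000 episodes.

    Returns per-player and team mean/std rewards plus win counts.
    """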
    player_1_rewards = []
    player_2_rewards = []
    team_rewards = []
    player_1_win = []
    player_2_win = []
    team_win = []

    for e in range(1000):
        # Initialize the environment and state
        env.reset()
        input_stack.__init__(env)
        prev_action_1 = 1
        prev_action_2 = 1
        prev_hard_coded_a = 1  # players init to up
        prev_hard_coded_b = 1  # players init to up
        print('Starting episode:', e)
        while True:
            # Select and perform an action
            action_1 = hard_coded_policy(env.observation,
                                         np.argwhere(env.head_board == 1)[0],
                                         prev_action_1,
                                         env.config.board_shape,
                                         env.action_space,
                                         eps=env.config.hcp_eps)
            action_2 = hard_coded_policy(env.observation,
                                         np.argwhere(env.head_board == 2)[0],
                                         prev_action_2,
                                         env.config.board_shape,
                                         env.action_space,
                                         eps=env.config.hcp_eps)
            hard_coded_a = hard_coded_policy(
                env.observation,
                np.argwhere(env.head_board == 3)[0],
                prev_hard_coded_a,
                env.config.board_shape,
                env.action_space,
                eps=env.config.hcp_eps)
            hard_coded_b = hard_coded_policy(
                env.observation,
                np.argwhere(env.head_board == 4)[0],
                prev_hard_coded_b,
                env.config.board_shape,
                env.action_space,
                eps=env.config.hcp_eps)

            prev_action_1 = action_1
            prev_action_2 = action_2
            prev_hard_coded_a = hard_coded_a
            prev_hard_coded_b = hard_coded_b
            next_observation, reward, done, dictionary = env.step(
                [action_1, action_2, hard_coded_a, hard_coded_b])

            if done:
                player_1_rewards.append(reward[0])
                player_2_rewards.append(reward[1])
                team_rewards.append(reward[0] + reward[1])
                player_1_win.append(reward[0] > 0)
                player_2_win.append(reward[1] > 0)
                team_win.append((reward[0] > 0) or (reward[1] > 0))
                break

            env.render()

    stats = [
        np.mean(player_1_rewards),
        np.std(player_1_rewards),
        np.mean(player_2_rewards),
        np.std(player_2_rewards),
        np.mean(team_rewards),
        np.std(team_rewards),
        np.sum(player_1_win),
        np.sum(player_2_win),
        np.sum(team_win)
    ]

    return stats
Example #6
win_loss = []

env = EnvSolo()

for e in range(1000):  # probably put number of episodes in config
    # Initialize the environment and state
    env.reset()
    prev_action = 1
    prev_hard_coded_a = 1  # players init to up
    print('Starting episode:', e)

    while True:
        # Select and perform an action
        action = hard_coded_policy(env.observation,
                                   np.argwhere(env.head_board == 1)[0],
                                   prev_action,
                                   env.config.board_shape,
                                   env.action_space,
                                   eps=env.config.hcp_eps)
        prev_action = action

        hard_coded_a = hard_coded_policy(env.observation,
                                         np.argwhere(env.head_board == 2)[0],
                                         prev_hard_coded_a,
                                         env.config.board_shape,
                                         env.action_space,
                                         eps=env.config.hcp_eps)
        prev_hard_coded_a = hard_coded_a

        next_observation, reward, done, dictionary = env.step(
            [action, hard_coded_a])
Example #7
def evaluate(policy_net_1, policy_net_2):
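    """Evaluate two learned teammates (players 1 and 2) against two
    hard-coded opponents for env.config.EVAL_EPISODE games.

    Returns per-player and team mean/std rewards plus win counts.
    """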
    player_1_rewards = []
    player_2_rewards = []
    team_rewards = []
    player_1_win = []
    player_2_win = []
    team_win = []

    for e in range(env.config.EVAL_EPISODE):
        # Initialize the environment and state
        env.reset()
        input_stack.__init__(env)
        prev_hard_coded_a = 1  # players init to up
        prev_hard_coded_b = 1  # players init to up
        print('Starting episode:', e)
        while True:
            # Select and perform an action
            action_1 = test_select_action(policy_net_1,
                                          input_stack,
                                          env,
                                          player_num=1)
            action_2 = test_select_action(policy_net_2,
                                          input_stack,
                                          env,
                                          player_num=2)
            hard_coded_a = hard_coded_policy(
                env.observation,
                np.argwhere(env.head_board == 3)[0],
                prev_hard_coded_a,
                env.config.board_shape,
                env.action_space,
                eps=env.config.hcp_eps)
            hard_coded_b = hard_coded_policy(
                env.observation,
                np.argwhere(env.head_board == 4)[0],
                prev_hard_coded_b,
                env.config.board_shape,
                env.action_space,
                eps=env.config.hcp_eps)

            prev_hard_coded_a = hard_coded_a
            prev_hard_coded_b = hard_coded_b
            next_observation, reward, done, dictionary = env.step(
                [action_1.item(),
                 action_2.item(), hard_coded_a, hard_coded_b])

            input_stack.update(env)

            if done:
                player_1_rewards.append(reward[0])
                player_2_rewards.append(reward[1])
                team_rewards.append(reward[0] + reward[1])
                player_1_win.append(reward[0] > 0)
                player_2_win.append(reward[1] > 0)
                team_win.append((reward[0] > 0) or (reward[1] > 0))
                break

    stats = [
        np.mean(player_1_rewards),
        np.std(player_1_rewards),
        np.mean(player_2_rewards),
        np.std(player_2_rewards),
        np.mean(team_rewards),
        np.std(team_rewards),
        np.sum(player_1_win),
        np.sum(player_2_win),
        np.sum(team_win)
    ]

    return stats