def evaluate(policy_net):
    total_rewards = []
    win_loss = []
    for e in range(env.config.EVAL_EPISODE):
        # Initialize the environment and state
        env.reset()
        input_stack.__init__(env)
        prev_hard_coded_a = 1  # players init to up
        print('Starting episode:', e)
        while True:
            # Select and perform an action
            action = test_select_action(policy_net, input_stack, env)
            if env.config.load_opponent is not None:
                hard_coded_a = test_select_action(player2_net, input_stack, env,
                                                  is_opponent=True).item()
            else:
                hard_coded_a = hard_coded_policy(env.observation,
                                                 np.argwhere(env.head_board == 2)[0],
                                                 prev_hard_coded_a, env.config.board_shape,
                                                 env.action_space, eps=env.config.hcp_eps)
            prev_hard_coded_a = hard_coded_a
            next_observation, reward, done, dictionary = env.step([action.item(), hard_coded_a])
            env.render()
            input_stack.update(env)
            if done:
                # utils.show_board(next_observation, dictionary['head_board'], env.config.cmap,
                #                  delay=env.config.delay, filename='tmp.png')
                player_reward = reward[0]
                win_loss.append(player_reward > 0)
                break
        total_rewards.append(player_reward)
    stats = [np.mean(total_rewards), np.std(total_rewards),
             np.sum(win_loss), len(win_loss) - np.sum(win_loss)]
    return stats
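# --- Hedged usage sketch (not from the original code) ---
# evaluate() above returns its summary as a positional list:
# [mean reward, reward std, wins, losses]. The snippet below shows one way a
# caller might unpack and report it; it assumes `policy_net` has already been
# built and loaded elsewhere in the script, and the print format is illustrative.
mean_reward, std_reward, wins, losses = evaluate(policy_net)
print('Evaluation over', wins + losses, 'episodes')
print('  mean reward: {:.3f} (std {:.3f})'.format(mean_reward, std_reward))
print('  win rate: {:.1%}'.format(wins / (wins + losses)))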
def evaluate(policy_net):
    total_rewards = []
    win_loss = []
    for e in range(env.config.EVAL_EPISODE):
        # Initialize the environment and state
        env.reset()
        input_stack.__init__(env)
        prev_a_3 = 1  # players init to up
        prev_a_4 = 1  # players init to up
        print('Starting episode:', e)
        while True:
            # Select and perform an action
            action = test_select_action(policy_net, input_stack, env, 1, 2)
            # Decode the single joint-action index into one move per teammate
            a_1 = np.floor_divide(action.item(), env.action_space.n)
            a_2 = action.item() % env.action_space.n
            if env.config.load_opponent is not None:
                opponent_action = test_select_action(opponent_net, input_stack, env, 3, 4)
                a_3 = np.floor_divide(opponent_action.item(), env.action_space.n)
                a_4 = opponent_action.item() % env.action_space.n
            else:
                a_3 = hard_coded_policy(env.observation,
                                        np.argwhere(env.head_board == 3)[0],
                                        prev_a_3, env.config.board_shape,
                                        env.action_space, eps=env.config.hcp_eps)
                a_4 = hard_coded_policy(env.observation,
                                        np.argwhere(env.head_board == 4)[0],
                                        prev_a_4, env.config.board_shape,
                                        env.action_space, eps=env.config.hcp_eps)
            prev_a_3 = a_3
            prev_a_4 = a_4
            next_observation, reward, done, dictionary = env.step([a_1, a_2, a_3, a_4])
            if env.config.show:
                env.render()
            input_stack.update(env)
            if done:
                # utils.show_board(next_observation, dictionary['head_board'], env.config.cmap,
                #                  delay=env.config.delay, filename='tmp.png')
                player_reward = reward[0]
                win_loss.append(player_reward > 0)
                break
        total_rewards.append(player_reward)
    stats = [np.mean(total_rewards), np.std(total_rewards),
             np.sum(win_loss), len(win_loss) - np.sum(win_loss)]
    return stats
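# --- Hedged sketch of the joint-action encoding (not from the original code) ---
# The evaluate() variant above drives both teammates with a single network whose
# output is one index over the joint action space, decoded with floor division
# and modulo. The self-contained helpers below illustrate that encoding; the
# function names and the value n = 4 are assumptions made for this example.
import numpy as np

def encode_joint(a_1, a_2, n):
    # Pack two per-player moves (each in [0, n)) into one joint index in [0, n * n).
    return a_1 * n + a_2

def decode_joint(joint, n):
    # Inverse of encode_joint: recover the two per-player moves.
    return np.floor_divide(joint, n), joint % n

n = 4  # e.g. up / down / left / right
for a_1 in range(n):
    for a_2 in range(n):
        assert decode_joint(encode_joint(a_1, a_2, n), n) == (a_1, a_2)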
while True:
    # Select and perform an action
    action = test_select_action(policy_net, input_stack, env, 1, 2)
    a_1 = np.floor_divide(action.item(), env.action_space.n)
    a_2 = action.item() % env.action_space.n
    if env.config.load_opponent is not None:
        opponent_action = test_select_action(opponent_net, input_stack, env, 3, 4)
        a_3 = np.floor_divide(opponent_action.item(), env.action_space.n)
        a_4 = opponent_action.item() % env.action_space.n
    else:
        a_3 = hard_coded_policy(env.observation,
                                np.argwhere(env.head_board == 3)[0],
                                prev_a_3, env.config.board_shape,
                                env.action_space, eps=env.config.hcp_eps)
        a_4 = hard_coded_policy(env.observation,
                                np.argwhere(env.head_board == 4)[0],
                                prev_a_4, env.config.board_shape,
                                env.action_space, eps=env.config.hcp_eps)
    prev_a_3 = a_3
    prev_a_4 = a_4
    next_observation, reward, done, dictionary = env.step([a_1, a_2, a_3, a_4])
    reward = torch.tensor([reward], device=device)
def evaluate(policy_net_1, policy_net_2, opponent_net1=None, opponent_net2=None):
    player_1_rewards = []
    player_2_rewards = []
    team_rewards = []
    player_1_win = []
    player_2_win = []
    team_win = []
    for e in range(env.config.EVAL_EPISODE):
        # Initialize the environment and state
        env.reset()
        input_stack.__init__(env)
        prev_hard_coded_a = 1  # players init to up
        prev_hard_coded_b = 1  # players init to up
        print('Starting episode:', e)
        while True:
            # Select and perform an action
            action_1 = test_select_action(policy_net_1, input_stack, env, player_num=1)
            action_2 = test_select_action(policy_net_2, input_stack, env, player_num=2)
            if env.config.load_opponent is not None:
                hard_coded_a = test_select_action(opponent_net1, input_stack, env, 3,
                                                  is_opponent=True).item()
                hard_coded_b = test_select_action(opponent_net2, input_stack, env, 4,
                                                  is_opponent=True).item()
            else:
                hard_coded_a = hard_coded_policy(env.observation,
                                                 np.argwhere(env.head_board == 3)[0],
                                                 prev_hard_coded_a, env.config.board_shape,
                                                 env.action_space, eps=env.config.hcp_eps)
                hard_coded_b = hard_coded_policy(env.observation,
                                                 np.argwhere(env.head_board == 4)[0],
                                                 prev_hard_coded_b, env.config.board_shape,
                                                 env.action_space, eps=env.config.hcp_eps)
            prev_hard_coded_a = hard_coded_a
            prev_hard_coded_b = hard_coded_b
            next_observation, reward, done, dictionary = env.step(
                [action_1.item(), action_2.item(), hard_coded_a, hard_coded_b])
            input_stack.update(env)
            if done:
                player_1_rewards.append(reward[0])
                player_2_rewards.append(reward[1])
                team_rewards.append(reward[0] + reward[1])
                player_1_win.append(reward[0] > 0)
                player_2_win.append(reward[1] > 0)
                team_win.append((reward[0] > 0) or (reward[1] > 0))
                utils.show_board(next_observation, dictionary['head_board'],
                                 env.config.cmap, delay=env.config.delay,
                                 filename='tmp.png')
                break
            env.render()
    stats = [
        np.mean(player_1_rewards), np.std(player_1_rewards),
        np.mean(player_2_rewards), np.std(player_2_rewards),
        np.mean(team_rewards), np.std(team_rewards),
        np.sum(player_1_win), np.sum(player_2_win), np.sum(team_win)
    ]
    return stats
def evaluate_hard():
    player_1_rewards = []
    player_2_rewards = []
    team_rewards = []
    player_1_win = []
    player_2_win = []
    team_win = []
    for e in range(1000):
        # Initialize the environment and state
        env.reset()
        input_stack.__init__(env)
        prev_action_1 = 1
        prev_action_2 = 1
        prev_hard_coded_a = 1  # players init to up
        prev_hard_coded_b = 1  # players init to up
        print('Starting episode:', e)
        while True:
            # Select and perform an action
            action_1 = hard_coded_policy(env.observation,
                                         np.argwhere(env.head_board == 1)[0],
                                         prev_action_1, env.config.board_shape,
                                         env.action_space, eps=env.config.hcp_eps)
            action_2 = hard_coded_policy(env.observation,
                                         np.argwhere(env.head_board == 2)[0],
                                         prev_action_2, env.config.board_shape,
                                         env.action_space, eps=env.config.hcp_eps)
            hard_coded_a = hard_coded_policy(env.observation,
                                             np.argwhere(env.head_board == 3)[0],
                                             prev_hard_coded_a, env.config.board_shape,
                                             env.action_space, eps=env.config.hcp_eps)
            hard_coded_b = hard_coded_policy(env.observation,
                                             np.argwhere(env.head_board == 4)[0],
                                             prev_hard_coded_b, env.config.board_shape,
                                             env.action_space, eps=env.config.hcp_eps)
            prev_action_1 = action_1
            prev_action_2 = action_2
            prev_hard_coded_a = hard_coded_a
            prev_hard_coded_b = hard_coded_b
            next_observation, reward, done, dictionary = env.step(
                [action_1, action_2, hard_coded_a, hard_coded_b])
            if done:
                player_1_rewards.append(reward[0])
                player_2_rewards.append(reward[1])
                team_rewards.append(reward[0] + reward[1])
                player_1_win.append(reward[0] > 0)
                player_2_win.append(reward[1] > 0)
                team_win.append((reward[0] > 0) or (reward[1] > 0))
                break
            env.render()
    stats = [
        np.mean(player_1_rewards), np.std(player_1_rewards),
        np.mean(player_2_rewards), np.std(player_2_rewards),
        np.mean(team_rewards), np.std(team_rewards),
        np.sum(player_1_win), np.sum(player_2_win), np.sum(team_win)
    ]
    return stats
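# --- Hedged reporting sketch (not from the original code) ---
# evaluate_hard() returns the same nine-element stats layout as the team
# evaluate() functions. The helper below labels those entries for readability;
# the function name and label strings are assumptions, and the ordering follows
# the stats list built above.
def report_team_stats(stats, n_episodes):
    # stats layout:
    # [p1 mean, p1 std, p2 mean, p2 std, team mean, team std, p1 wins, p2 wins, team wins]
    labels = ['player 1 reward', 'player 2 reward', 'team reward']
    for i, label in enumerate(labels):
        print('{}: {:.3f} +/- {:.3f}'.format(label, stats[2 * i], stats[2 * i + 1]))
    print('player 1 wins: {} / {}'.format(stats[6], n_episodes))
    print('player 2 wins: {} / {}'.format(stats[7], n_episodes))
    print('team wins: {} / {}'.format(stats[8], n_episodes))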
win_loss = []
env = EnvSolo()
for e in range(1000):  # probably put the number of episodes in the config
    # Initialize the environment and state
    env.reset()
    prev_action = 1
    prev_hard_coded_a = 1  # players init to up
    print('Starting episode:', e)
    while True:
        # Select and perform an action
        action = hard_coded_policy(env.observation,
                                   np.argwhere(env.head_board == 1)[0],
                                   prev_action, env.config.board_shape,
                                   env.action_space, eps=env.config.hcp_eps)
        prev_action = action
        hard_coded_a = hard_coded_policy(env.observation,
                                         np.argwhere(env.head_board == 2)[0],
                                         prev_hard_coded_a, env.config.board_shape,
                                         env.action_space, eps=env.config.hcp_eps)
        prev_hard_coded_a = hard_coded_a
        next_observation, reward, done, dictionary = env.step(
            [action, hard_coded_a])
def evaluate(policy_net_1, policy_net_2):
    player_1_rewards = []
    player_2_rewards = []
    team_rewards = []
    player_1_win = []
    player_2_win = []
    team_win = []
    for e in range(env.config.EVAL_EPISODE):
        # Initialize the environment and state
        env.reset()
        input_stack.__init__(env)
        prev_hard_coded_a = 1  # players init to up
        prev_hard_coded_b = 1  # players init to up
        print('Starting episode:', e)
        while True:
            # Select and perform an action
            action_1 = test_select_action(policy_net_1, input_stack, env, player_num=1)
            action_2 = test_select_action(policy_net_2, input_stack, env, player_num=2)
            hard_coded_a = hard_coded_policy(env.observation,
                                             np.argwhere(env.head_board == 3)[0],
                                             prev_hard_coded_a, env.config.board_shape,
                                             env.action_space, eps=env.config.hcp_eps)
            hard_coded_b = hard_coded_policy(env.observation,
                                             np.argwhere(env.head_board == 4)[0],
                                             prev_hard_coded_b, env.config.board_shape,
                                             env.action_space, eps=env.config.hcp_eps)
            prev_hard_coded_a = hard_coded_a
            prev_hard_coded_b = hard_coded_b
            next_observation, reward, done, dictionary = env.step(
                [action_1.item(), action_2.item(), hard_coded_a, hard_coded_b])
            input_stack.update(env)
            if done:
                player_1_rewards.append(reward[0])
                player_2_rewards.append(reward[1])
                team_rewards.append(reward[0] + reward[1])
                player_1_win.append(reward[0] > 0)
                player_2_win.append(reward[1] > 0)
                team_win.append((reward[0] > 0) or (reward[1] > 0))
                break
    stats = [
        np.mean(player_1_rewards), np.std(player_1_rewards),
        np.mean(player_2_rewards), np.std(player_2_rewards),
        np.mean(team_rewards), np.std(team_rewards),
        np.sum(player_1_win), np.sum(player_2_win), np.sum(team_win)
    ]
    return stats
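# --- Hedged driver sketch (not from the original code) ---
# One plausible way to run the two-network team evaluation from a __main__ block.
# The checkpoint paths, and the assumption that policy_net_1 / policy_net_2 have
# already been constructed with the right architecture and that `device` is
# defined as in the training code, are illustrative only.
if __name__ == '__main__':
    policy_net_1.load_state_dict(torch.load('checkpoints/player1.pt', map_location=device))
    policy_net_2.load_state_dict(torch.load('checkpoints/player2.pt', map_location=device))
    policy_net_1.eval()
    policy_net_2.eval()
    stats = evaluate(policy_net_1, policy_net_2)
    print('Evaluation stats:', stats)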