def selfplay():
    """
        legacy function for trying to implement self-play reinforcement learning like alpha-zero Go
    """
    agent2 = Agent(0.99, 0.1, 0.003, 42, train_games, 7, eps_dec)
    agent2.load_checkpoint()
    global win_cntr
    global done
    g = Game()
    turn = random.choice([PLAYER, AI])
    done = False
    transitions_agent = []
    transitions_agent2 = []
    while not done:
        g.printBoard()
        if turn == PLAYER:
            # row = input('{}\'s turn: '.format('Red'))
            # g.insert(int(row), turn)
            # flatten the board into a 1-D state vector (42 cells)
            observation = np.asarray([cell for row in g.board for cell in row])
            action = agent2.choose_action(observation)
            if g.check_if_action_valid(action):
                print(f"Red's turn: {action}")
                g.insert(action, PLAYER_PIECE)
            else:
                # penalize the invalid choice (for agent2, whose move this is) and
                # resample random columns until a legal one is found
                while not g.check_if_action_valid(action):
                    agent2.store_transition(observation, action, -100,
                                            observation, done)
                    action = np.random.randint(7)
                print(f"Red's turn: {action}")
                g.insert(action, PLAYER_PIECE)
            # state after the move
            observation_ = np.asarray([cell for row in g.board for cell in row])
            transitions_agent2 += [(observation, action, observation_, done)]
        else:
            observation = np.asarray([cell for row in g.board for cell in row])
            action = agent.choose_action(observation)
            if g.check_if_action_valid(action):
                print(f"Yellow's turn: {action}")
                g.insert(action, AI_PIECE)
            else:
                # penalize the invalid choice and resample random columns until a legal one is found
                while not g.check_if_action_valid(action):
                    agent.store_transition(observation, action, -100,
                                           observation, done)
                    action = np.random.randint(7)
                print(f"Yellow's turn: {action}")
                g.insert(action, AI_PIECE)
            # state after the move
            observation_ = np.asarray([cell for row in g.board for cell in row])
            transitions_agent += [(observation, action, observation_, done)]
        turn = AI if turn == PLAYER else PLAYER
    if g.getWinner() == Tie:
        reward_agent = 0
    else:
        # `turn` was already flipped after the winning move, so the winner is the other side
        winner = AI if turn == PLAYER else PLAYER
        if winner == AI:
            win_cntr += 1
            if vertical_win:
                reward_agent = 5
            else:
                reward_agent = 20

        else:
            reward_agent = -20

    # the terminal reward is only known now, so write it into every buffered transition of the main agent
    for i in range(len(transitions_agent)):
        agent.store_transition(transitions_agent[i][0],
                               transitions_agent[i][1], reward_agent,
                               transitions_agent[i][2],
                               transitions_agent[i][3])
    agent.learn()
    return
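

# The loop above only ever trains `agent`; `transitions_agent2` is filled but never
# replayed, so the second network cannot improve. Below is a minimal sketch of how that
# gap could be closed. It assumes the same Agent.store_transition()/learn() interface
# used elsewhere in this file and simply mirrors the main agent's terminal reward; it is
# an illustration of the intended self-play update, not the original implementation.
def replay_for_agent2(agent2, transitions_agent2, reward_agent):
    """Assign the mirrored terminal reward to agent2's buffered transitions and learn once."""
    # agent2 played the opposite side, so its reward is the negation of the main agent's
    # reward (a tie stays 0)
    reward_agent2 = -reward_agent
    for observation, action, observation_, done in transitions_agent2:
        agent2.store_transition(observation, action, reward_agent2, observation_, done)
    agent2.learn()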


# Initialize hyperparameters and metrics

train_games = 1
minimax_dep = 3
epsilon = 0  # play with the trained network; the agent falls back to its minimum epsilon of 0.01
eps_dec = 0.9999  # irrelevant while epsilon is 0
print_episode = 100

# positional args (presumably): discount, epsilon, learning rate, input dims (42 board cells),
# batch size, number of actions (7 columns), epsilon decay
agent = Agent(0.99, epsilon, 0.01, 42, 64, 7, eps_dec)
agent.load_checkpoint()

lose_cntr = 0
win_cntr = 0
win_dif = 0

for i in range(train_games):
    if i == 0:
        print(f'Start simulation for minimax with depth {minimax_dep}...')
    elif i % print_episode == 0:
        win_dif = win_cntr - win_dif
        current_eps = epsilon * eps_dec ** i
        print(
            f"Episode {i} trained successfully | Games won: {int(win_cntr)} | "
            f"Games won in last {print_episode} episodes: {round(win_dif * 100 / print_episode, 2)}% | "
            f"Decay: {round(current_eps * 100 if current_eps > 0.01 else 0.01, 2)}%"
        )
        win_dif = win_cntr
    play_minimax(minimax_dep)
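
# Once the loop finishes, print a short summary of the run. The save_checkpoint() call is
# an assumption: Agent exposes load_checkpoint() in this file, but a matching save method
# is not shown here, so it is only attempted if it exists.
print(f'Simulation finished | Games played: {train_games} | Games won: {int(win_cntr)} '
      f'({round(win_cntr * 100 / train_games, 2)}%)')
if hasattr(agent, 'save_checkpoint'):
    agent.save_checkpoint()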