def selfplay():
    """Legacy attempt at self-play reinforcement learning, loosely inspired by AlphaZero for Go.

    A separate checkpointed copy of the network (agent2) plays the Red side against the
    learning agent (Yellow). Only `agent` learns from the transitions collected here.
    """
    agent2 = Agent(0.99, 0.1, 0.003, 42, train_games, 7, eps_dec)
    agent2.load_checkpoint()
    global win_cntr
    global done

    g = Game()
    turn = random.choice([PLAYER, AI])
    done = False  # module-level flag; expected to be set True elsewhere once the game ends
    transitions_agent = []
    transitions_agent2 = []

    while not done:
        g.printBoard()
        if turn == PLAYER:
            # Manual input is disabled in self-play mode:
            # row = input('{}\'s turn: '.format('Red'))
            # g.insert(int(row), turn)

            # Flatten the 2-D board into a 1-D observation vector.
            observation = np.asarray([cell for row in g.board for cell in row])
            action = agent2.choose_action(observation)
            if g.check_if_action_valid(action):
                print(f"Red's turn: {action}")
                g.insert(action, PLAYER_PIECE)
            else:
                # Penalise invalid moves of the acting agent, then fall back to a random column.
                while not g.check_if_action_valid(action):
                    agent2.store_transition(observation, action, -100, observation, done)
                    action = np.random.randint(7)
                print(f"Red's turn: {action}")
                g.insert(action, PLAYER_PIECE)

            observation_ = np.asarray([cell for row in g.board for cell in row])
            # The opponent's transitions are collected but never used for learning here.
            transitions_agent2.append((observation, action, observation_, done))
        else:
            observation = np.asarray([cell for row in g.board for cell in row])
            action = agent.choose_action(observation)
            if g.check_if_action_valid(action):
                print(f"Yellow's turn: {action}")
                g.insert(action, AI_PIECE)
            else:
                while not g.check_if_action_valid(action):
                    agent.store_transition(observation, action, -100, observation, done)
                    action = np.random.randint(7)
                print(f"Yellow's turn: {action}")
                g.insert(action, AI_PIECE)

            observation_ = np.asarray([cell for row in g.board for cell in row])
            transitions_agent.append((observation, action, observation_, done))

        turn = AI if turn == PLAYER else PLAYER

    # Assign the final reward. `turn` was already switched after the last move,
    # so the player who moved last (the winner, unless the game is a tie) is the other one.
    if g.getWinner() == Tie:
        reward_agent = 0
    else:
        winner = AI if turn == PLAYER else PLAYER
        if winner == AI:
            win_cntr += 1
            # A vertical win is easier to reach, so it earns a smaller reward.
            reward_agent = 5 if vertical_win else 20
        else:
            reward_agent = -20

    # Back-fill the final reward into every stored transition of the learning agent.
    for obs, act, obs_, d in transitions_agent:
        agent.store_transition(obs, act, reward_agent, obs_, d)
    agent.learn()
    return
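
# Sketch of a symmetric self-play update (an assumption, not what selfplay() above
# actually does): in a zero-sum game the opponent could learn from the same episode
# by replaying its own transitions with the mirrored reward. Assumes `learner` is an
# Agent exposing the store_transition()/learn() interface already used above.
def replay_with_reward(learner, transitions, reward):
    """Back-fill a final reward into stored (obs, action, obs_, done) transitions and learn once."""
    for obs, act, obs_, d in transitions:
        learner.store_transition(obs, act, reward, obs_, d)
    learner.learn()

# Hypothetical usage inside selfplay(): replay_with_reward(agent, transitions_agent, reward_agent)
# and, for true two-sided self-play, replay_with_reward(agent2, transitions_agent2, -reward_agent).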
# Initialize hyperparameters and metrics
train_games = 1
minimax_dep = 3
epsilon = 0       # evaluation run with trained weights; the agent falls back to its minimum epsilon of 0.01
eps_dec = 0.9999  # epsilon decay rate; irrelevant while epsilon starts at 0
print_episode = 100

agent = Agent(0.99, epsilon, 0.01, 42, 64, 7, eps_dec)
agent.load_checkpoint()

lose_cntr = 0
win_cntr = 0
win_dif = 0

for i in range(train_games):
    if i == 0:
        print(f'Start simulation for minimax with depth {minimax_dep}...')
    elif i % print_episode == 0:
        win_dif = win_cntr - win_dif
        decay = epsilon * eps_dec ** i
        print(
            f"Episode {i} trained successfully"
            f" | Games won: {int(win_cntr)}"
            f" | Games won in last {print_episode} episodes: {round(win_dif * 100 / print_episode, 2)}%"
            f" | Decay: {round(decay * 100 if decay > 0.01 else 0.01, 2)}%"
        )
        win_dif = win_cntr
    play_minimax(minimax_dep)
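
# Optional usage sketch (commented out, an assumption about how the legacy self-play
# path would be driven): the same driver loop can call selfplay() instead of
# play_minimax(); this relies on the checkpoint loaded by agent2 inside selfplay()
# being present on disk.
#
# for i in range(train_games):
#     selfplay()
#     if i > 0 and i % print_episode == 0:
#         print(f"Episode {i} | Games won so far: {int(win_cntr)}")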