import sys

import numpy as np

# Checkers and evaluate() are defined elsewhere in the project.

def min_max_alpha_beta(game, turn, max_depth,
                       alpha=-sys.float_info.max, beta=sys.float_info.max):
    """Depth-limited minimax with alpha-beta pruning.

    turn > 0 is the maximizing player, turn < 0 the minimizing player.
    Returns (score, best_move).
    """
    best_score_move = None
    game.available_moves()
    moves = game.moves
    if not moves:
        return 0, None  # no legal moves: score the position as neutral
    for move in moves:
        # Simulate the move on a copy of the position. The board must be
        # copied (np.array), not aliased, otherwise the simulated move would
        # mutate the parent game and corrupt the rest of the search.
        new_game = Checkers()
        new_game.board_state = np.array(game.board_state)
        new_game.turn = game.turn
        new_game.moves_queen_with_out_capture = game.moves_queen_with_out_capture
        new_game.move(move)
        winner = new_game.win
        if winner != 0:
            # A winning move ends the search at this node immediately.
            return winner * 10000, move
        if max_depth <= 1:
            score = evaluate(new_game)  # leaf node: use the static evaluation
        else:
            score, _ = min_max_alpha_beta(new_game, -turn, max_depth - 1, alpha, beta)
        if turn > 0:
            if score > alpha:  # maximizing player raises the lower bound
                alpha = score
                best_score_move = move
        else:
            if score < beta:  # minimizing player lowers the upper bound
                beta = score
                best_score_move = move
        if alpha >= beta:
            break  # prune: the opponent will never allow this line
    return (alpha if turn > 0 else beta), best_score_move
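# A minimal sketch of how the search above might be invoked. The
# min_max.min_max_player used in the training loop below is presumably a thin
# wrapper of this shape; SEARCH_DEPTH is an illustrative assumption and not
# part of the source.
SEARCH_DEPTH = 4

def min_max_player(game, turn, max_depth=SEARCH_DEPTH):
    # Run the alpha-beta search from the current position and return only
    # the chosen move, discarding the score.
    _, move = min_max_alpha_beta(game, turn, max_depth)
    return move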
# session, GAME, the models, the replay buffer, and the hyper-parameters
# (MIN_EXPERIENCES, num_episodes, epsilon, ...) are created in the setup
# code earlier in the script.
session.run(tf.global_variables_initializer())
GAME.reset()
wins = 0

# Pre-fill the replay buffer: a random player moves, the minimax player replies.
for i in range(MIN_EXPERIENCES):
    GAME.available_moves()
    if GAME.win != 0:
        GAME.reset()  # previous game finished; start a fresh one
    move = random_play(GAME)
    action = encoding_move(move)
    GAME.move(move)
    if GAME.win == 0:
        # Let the minimax opponent answer on a copy of the position,
        # then apply its move to the real game.
        new_GAME = Checkers()
        new_GAME.board_state = np.array(GAME.board_state)
        new_GAME.turn = GAME.turn
        new_GAME.moves_queen_with_out_capture = GAME.moves_queen_with_out_capture
        move = min_max.min_max_player(new_GAME, new_GAME.turn)
        GAME.move(move)
    reward = GAME.win  # +1/-1 when the game just ended, 0 otherwise
    experience_replay_buffer.add_experince(action, GAME.board_state, reward)

# Main training loop.
t0 = datetime.now()
for i in range(num_episodes):
    total_t, episode_reward, duration, num_steps_in_episode, time_per_step, epsilon = play_one(
        total_t, experience_replay_buffer, model1, model2, epsilon)
    episode_rewards[i] = episode_reward
    last_100_avg = episode_rewards[max(0, i - 100):i + 1].mean()
    print("Episode:", i,
          "Duration:", duration,
          "Num steps:", num_steps_in_episode,
          "Reward:", episode_reward,
          "Avg reward (last 100):", last_100_avg)
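# The experience_replay_buffer used above is defined elsewhere in the project;
# below is a minimal sketch consistent with the add_experince(action, state,
# reward) call seen in the loop (the method name keeps the source's spelling).
# The capacity and uniform sampling are assumptions, not the source's code.
from collections import deque
import random

class SimpleReplayBuffer:
    def __init__(self, max_size=10000):
        # Bounded queue: the oldest experiences drop off automatically.
        self.buffer = deque(maxlen=max_size)

    def add_experince(self, action, state, reward):
        # Store an (action, resulting state, reward) transition.
        self.buffer.append((action, np.array(state), reward))

    def sample(self, batch_size):
        # Uniform random minibatch for a training step.
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))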