Example #1
def IGA(pi_alpha, pi_beta, payoff_0, payoff_1, u_alpha, u_beta, config):
    pi_alpha_history = [pi_alpha]
    pi_beta_history = [pi_beta]
    pi_alpha_gradient_history = [0.0]
    pi_beta_gradient_history = [0.0]
    converge_step = 0
    for i in range(config["iteration"]):
        # IGA: each player ascends the gradient of its own expected payoff
        pi_alpha_gradient = pi_beta * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
        pi_beta_gradient = pi_alpha * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
        pi_alpha_next = pi_alpha + config["lr"] * pi_alpha_gradient
        pi_beta_next = pi_beta + config["lr"] * pi_beta_gradient
        # clip the updated policies to the valid probability range [0, 1]
        pi_alpha = max(0.0, min(1.0, pi_alpha_next))
        pi_beta = max(0.0, min(1.0, pi_beta_next))
        pi_alpha_gradient_history.append(pi_alpha_gradient)
        pi_beta_gradient_history.append(pi_beta_gradient)
        pi_alpha_history.append(pi_alpha)
        pi_beta_history.append(pi_beta)

        if converge_step == 0 and convergence_check(pi_alpha, pi_beta,
                                                    config["target_nash"]):
            converge_step = i
            break
    return (
        pi_alpha_history,
        pi_beta_history,
        pi_alpha_gradient_history,
        pi_beta_gradient_history,
        converge_step,
    )
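The examples assume a few helpers and constants that are not shown: a 2x2 payoff table indexed by action pairs, the terms u_alpha and u_beta, and a convergence_check routine. Below is a minimal sketch under assumptions: payoff_0[(i, j)] is the row player's payoff when alpha plays action i and beta plays action j, u_alpha is the bilinear coefficient payoff_0[(0, 0)] - payoff_0[(0, 1)] - payoff_0[(1, 0)] + payoff_0[(1, 1)] (so that pi_beta * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)] is exactly dV_alpha/dpi_alpha), and convergence_check compares the policies to a list of target equilibria with a tolerance. The matching-pennies payoffs, tolerance, and config values are illustrative, not taken from the project.

# Hypothetical scaffolding for the IGA example; payoff values, tolerance and
# config values are illustrative assumptions, not the project's defaults.
def make_game():
    # matching pennies: payoff_0[(i, j)] is the row player's payoff when
    # alpha plays action i and beta plays action j
    payoff_0 = {(0, 0): 1, (0, 1): -1, (1, 0): -1, (1, 1): 1}
    payoff_1 = {k: -v for k, v in payoff_0.items()}  # zero-sum opponent
    # bilinear coefficient of the 2x2 expected payoff, so that
    # dV_alpha/dpi_alpha = pi_beta * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
    u_alpha = payoff_0[(0, 0)] - payoff_0[(0, 1)] - payoff_0[(1, 0)] + payoff_0[(1, 1)]
    u_beta = payoff_1[(0, 0)] - payoff_1[(0, 1)] - payoff_1[(1, 0)] + payoff_1[(1, 1)]
    return payoff_0, payoff_1, u_alpha, u_beta


def convergence_check(pi_alpha, pi_beta, target_nash, tol=1e-3):
    # converged once both policies are within tol of one of the target
    # equilibria; target_nash is assumed to be a list of (alpha*, beta*) pairs
    return any(abs(pi_alpha - a) < tol and abs(pi_beta - b) < tol
               for a, b in target_nash)


payoff_0, payoff_1, u_alpha, u_beta = make_game()
config = {"iteration": 10000, "lr": 1e-3, "target_nash": [(0.5, 0.5)]}
histories = IGA(0.9, 0.2, payoff_0, payoff_1, u_alpha, u_beta, config)

On matching pennies, plain IGA is known to orbit the mixed equilibrium rather than converge to it, which is the motivation for the prediction, WoLF, and trust-region variants in the later examples.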
Example #2
def IGA_PP(pi_alpha, pi_beta, payoff_0, payoff_1, u_alpha, u_beta, config):
    pi_alpha_history = [pi_alpha]
    pi_beta_history = [pi_beta]
    pi_alpha_gradient_history = [0.0]
    pi_beta_gradient_history = [0.0]
    converge_step = 0
    for i in range(config["iteration"]):
        # predict the opponent's policy one gradient step ahead (IGA-PP lookahead)
        pi_beta_pp = pi_beta + config["gamma"] * (
            pi_alpha * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
        )
        # alpha's gradient is taken against the predicted opponent policy
        pi_alpha_gradient = pi_beta_pp * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
        pi_alpha_next = pi_alpha + config["lr"] * pi_alpha_gradient
        # optional KL regularisation (disabled):
        # kl_alpha = kl(pi_alpha, pi_alpha_next)
        # pi_alpha_next = pi_alpha + config["lr"] * (pi_alpha_gradient + config["kl_coeff"] * kl_alpha)
        if not config["single"]:
            # both players predict: beta also updates against a predicted alpha
            pi_alpha_pp = pi_alpha + config["gamma"] * (
                pi_beta * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
            )
            pi_beta_gradient = pi_alpha_pp * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
            pi_beta_next = pi_beta + config["lr"] * pi_beta_gradient
            # kl_beta = kl(pi_beta_pp, pi_beta_next)
            # pi_beta_next = pi_beta + config["lr"] * (pi_beta_gradient + config["kl_coeff"] * kl_beta)
        else:
            # only alpha predicts: beta takes a plain IGA step
            pi_beta_gradient = pi_alpha * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
            pi_beta_next = pi_beta + config["lr"] * pi_beta_gradient
            # kl_beta = kl(pi_beta_pp, pi_beta_next)
            # pi_beta_next = pi_beta + config["lr"] * (pi_beta_gradient + config["kl_coeff"] * kl_beta)

        pi_alpha_gradient_history.append(pi_alpha_gradient)
        pi_beta_gradient_history.append(pi_beta_gradient)
        pi_alpha = max(0.0, min(1.0, pi_alpha_next))
        pi_beta = max(0.0, min(1.0, pi_beta_next))
        pi_alpha_history.append(pi_alpha)
        pi_beta_history.append(pi_beta)
        if converge_step == 0 and convergence_check(pi_alpha, pi_beta,
                                                    config["target_nash"]):
            converge_step = i
            break
    return (
        pi_alpha_history,
        pi_beta_history,
        pi_alpha_gradient_history,
        pi_beta_gradient_history,
        converge_step,
    )
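Relative to plain IGA, IGA_PP reads two extra keys from config: gamma, the step size of the one-step opponent prediction, and single, which selects whether only alpha (True) or both players (False) use the predicted opponent policy. A hedged usage sketch, reusing the illustrative scaffolding from Example #1; the numeric values are assumptions:

config_pp = {
    "iteration": 10000,
    "lr": 1e-3,
    "gamma": 1e-2,        # lookahead step size (illustrative)
    "single": False,      # both players predict their opponent
    "target_nash": [(0.5, 0.5)],
}
(alpha_hist, beta_hist, alpha_grads, beta_grads, converge_step) = IGA_PP(
    0.9, 0.2, payoff_0, payoff_1, u_alpha, u_beta, config_pp
)
print("converged at step", converge_step)

Note that kl_coeff only appears in commented-out code above, so it is not required for this sketch.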
Example #3
File: wolf.py  Project: matrl-project/matrl
def WoLF_IGA3(pi_alpha, pi_beta, payoff_0, payoff_1, u_alpha, u_beta, config):
    pi_alpha_history = [pi_alpha]
    pi_beta_history = [pi_beta]
    pi_alpha_gradient_history = [0.0]
    pi_beta_gradient_history = [0.0]
    converge_step = 0
    for i in range(config["iteration"]):
        # WoLF step-size rule: learn slowly (lr_min) when "winning", i.e. when
        # the current expected payoff beats what the target equilibrium strategy
        # would earn against the opponent's current policy; otherwise lr_max
        lr_alpha = config["lr_max"]
        lr_beta = config["lr_max"]
        if V(pi_alpha, pi_beta, payoff_0) > V(
            config["target_nash"][0][0], pi_beta, payoff_0
        ):
            lr_alpha = config["lr_min"]
        if V(pi_alpha, pi_beta, payoff_1) > V(
            pi_alpha, config["target_nash"][0][1], payoff_1
        ):
            lr_beta = config["lr_min"]
        pi_alpha_gradient = pi_beta * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
        pi_beta_gradient = pi_alpha * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
        pi_alpha_next = pi_alpha + lr_alpha * pi_alpha_gradient
        pi_beta_next = pi_beta + lr_beta * pi_beta_gradient
        pi_alpha = max(0.0, min(1.0, pi_alpha_next))
        pi_beta = max(0.0, min(1.0, pi_beta_next))
        pi_alpha_gradient_history.append(pi_alpha_gradient)
        pi_beta_gradient_history.append(pi_beta_gradient)
        pi_alpha_history.append(pi_alpha)
        pi_beta_history.append(pi_beta)
        # record the first step at which the target Nash equilibrium is reached
        # (this variant keeps iterating instead of breaking early)
        if converge_step == 0 and convergence_check(
            pi_alpha, pi_beta, config["target_nash"]
        ):
            converge_step = i
    return (
        pi_alpha_history,
        pi_beta_history,
        pi_alpha_gradient_history,
        pi_beta_gradient_history,
        converge_step,
    )
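WoLF_IGA3 additionally relies on a value function V that is not shown. Consistent with the gradient expressions above, it is presumably the expected payoff of the 2x2 game under the mixed strategies (pi_alpha, pi_beta); a minimal sketch under that assumption:

def V(pi_alpha, pi_beta, payoff):
    # expected payoff when the row player takes action 0 with probability
    # pi_alpha and the column player takes action 0 with probability pi_beta
    return (pi_alpha * pi_beta * payoff[(0, 0)]
            + pi_alpha * (1 - pi_beta) * payoff[(0, 1)]
            + (1 - pi_alpha) * pi_beta * payoff[(1, 0)]
            + (1 - pi_alpha) * (1 - pi_beta) * payoff[(1, 1)])

With this V, the step-size rule reads: a player takes the small step lr_min when its current expected payoff already exceeds what its target-equilibrium strategy (config["target_nash"][0]) would earn against the opponent's current policy, and the large step lr_max otherwise.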
Example #4
def IGA_TRPO(pi_alpha, pi_beta, payoff_0, payoff_1, u_alpha, u_beta, config):
    pi_alpha_history = []
    pi_beta_history = []
    pi_alpha_gradient_history = []
    pi_beta_gradient_history = []
    meta_strategies = []
    converge_step = 0
    pi_alpha_raw = []
    pi_beta_raw = []
    pi_alpha_no_meta = []
    pi_beta_no_meta = []
    for i in range(config["iteration"]):
        # compute candidate next policies and gradients via the TRPO-style helper
        pi_alpha_1, pi_beta_1, grad_alpha, grad_beta, pi_wo_kl = get_next_policy_trpo(
            pi_alpha,
            pi_beta,
            u_alpha,
            u_beta,
            payoff_0,
            payoff_1,
            config["lr"],
            config["kl_coeff"],
        )
        pi_alpha_raw.append(pi_wo_kl[0])
        pi_beta_raw.append(pi_wo_kl[1])
        # clip the candidate policies to the valid probability range [0, 1]
        pi_alpha_1 = max(0.0, min(1.0, pi_alpha_1))
        pi_beta_1 = max(0.0, min(1.0, pi_beta_1))
        pi_alpha_no_meta.append(pi_alpha_1)
        pi_beta_no_meta.append(pi_beta_1)
        pi_alpha_next, pi_beta_next, meta_strategy = get_nash_next(
            pi_alpha,
            pi_beta,
            pi_alpha_1,
            pi_beta_1,
            payoff_0,
            payoff_1,
            config["kl_coeff"],
        )
        meta_strategies.append(meta_strategy)
        # best-response gradient step against the opponent's next policy from get_nash_next
        BR_pi_alpha_gradient = pi_beta_next * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
        BR_pi_beta_gradient = pi_alpha_next * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
        BR_pi_alpha_next = pi_alpha + config["br_lr"] * BR_pi_alpha_gradient
        BR_pi_beta_next = pi_beta + config["br_lr"] * BR_pi_beta_gradient
        pi_alpha = max(0.0, min(1.0, BR_pi_alpha_next))
        pi_beta = max(0.0, min(1.0, BR_pi_beta_next))
        pi_alpha_gradient_history.append(grad_alpha)
        pi_beta_gradient_history.append(grad_beta)
        pi_alpha_history.append(pi_alpha)
        pi_beta_history.append(pi_beta)
        if converge_step == 0 and convergence_check(pi_alpha, pi_beta,
                                                    config["target_nash"]):
            converge_step = i
            break

    return (
        pi_alpha_history,
        pi_beta_history,
        pi_alpha_gradient_history,
        pi_beta_gradient_history,
        meta_strategies,
        converge_step,
        # pi_alpha_raw,
        # pi_beta_raw,
        # pi_alpha_no_meta,
        # pi_beta_no_meta,
    )
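get_next_policy_trpo and get_nash_next are project-specific helpers that are not reproduced here, so only the calling convention can be sketched. From the code, config needs lr and kl_coeff for the trust-region-style step, br_lr for the best-response step, plus iteration and target_nash; the values below are placeholders, not the project's defaults:

config_trpo = {
    "iteration": 5000,
    "lr": 1e-2,
    "kl_coeff": 1.0,   # passed to the helpers (role assumed: KL penalty weight)
    "br_lr": 1e-3,     # step size for the best-response update
    "target_nash": [(0.5, 0.5)],
}
(alpha_hist, beta_hist, alpha_grads, beta_grads,
 meta_strategies, converge_step) = IGA_TRPO(
    0.9, 0.2, payoff_0, payoff_1, u_alpha, u_beta, config_trpo
)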