def IGA(pi_alpha, pi_beta, payoff_0, payoff_1, u_alpha, u_beta, config):
    """Infinitesimal Gradient Ascent: both players take fixed-step
    gradient ascent on their own expected payoff."""
    pi_alpha_history = [pi_alpha]
    pi_beta_history = [pi_beta]
    pi_alpha_gradient_history = [0.0]
    pi_beta_gradient_history = [0.0]
    converge_step = 0
    for i in range(config["iteration"]):
        # Gradient of each player's expected payoff w.r.t. its own policy.
        pi_alpha_gradient = pi_beta * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
        pi_beta_gradient = pi_alpha * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
        pi_alpha_next = pi_alpha + config["lr"] * pi_alpha_gradient
        pi_beta_next = pi_beta + config["lr"] * pi_beta_gradient
        # Clip to the valid probability range [0, 1].
        pi_alpha = max(0.0, min(1.0, pi_alpha_next))
        pi_beta = max(0.0, min(1.0, pi_beta_next))
        pi_alpha_gradient_history.append(pi_alpha_gradient)
        pi_beta_gradient_history.append(pi_beta_gradient)
        pi_alpha_history.append(pi_alpha)
        pi_beta_history.append(pi_beta)
        if converge_step == 0 and convergence_check(
            pi_alpha, pi_beta, config["target_nash"]
        ):
            converge_step = i
            break
    return (
        pi_alpha_history,
        pi_beta_history,
        pi_alpha_gradient_history,
        pi_beta_gradient_history,
        converge_step,
    )
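# `convergence_check` is used by every learner below but is not defined in
# this file. A minimal sketch, assuming config["target_nash"] is a list of
# (pi_alpha*, pi_beta*) pairs (consistent with the
# config["target_nash"][0][0] indexing in WoLF_IGA3) and a hypothetical
# tolerance `tol`:
def convergence_check(pi_alpha, pi_beta, target_nash, tol=1e-2):
    # True once the joint policy is within tol of any target Nash point.
    return any(
        abs(pi_alpha - a) < tol and abs(pi_beta - b) < tol
        for a, b in target_nash
    )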
def IGA_PP(pi_alpha, pi_beta, payoff_0, payoff_1, u_alpha, u_beta, config):
    """IGA with Policy Prediction: each predicting player differentiates
    against a one-step look-ahead of the opponent's policy, with
    look-ahead step size gamma."""
    pi_alpha_history = [pi_alpha]
    pi_beta_history = [pi_beta]
    pi_alpha_gradient_history = [0.0]
    pi_beta_gradient_history = [0.0]
    converge_step = 0
    for i in range(config["iteration"]):
        # Predict the opponent's next policy by one gradient step.
        pi_beta_pp = pi_beta + config["gamma"] * (
            pi_alpha * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
        )
        pi_alpha_gradient = pi_beta_pp * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
        # kl_alpha = kl(pi_alpha, pi_alpha_next)
        pi_alpha_next = pi_alpha + config["lr"] * (
            pi_alpha_gradient  # + config["kl_coeff"] * kl_alpha
        )
        if not config["single"]:
            # Both players predict: beta also plays against alpha's look-ahead.
            pi_alpha_pp = pi_alpha + config["gamma"] * (
                pi_beta * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
            )
            pi_beta_gradient = pi_alpha_pp * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
        else:
            # Only alpha predicts; beta follows the plain IGA gradient.
            pi_beta_gradient = pi_alpha * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
        # kl_beta = kl(pi_beta, pi_beta_next)
        pi_beta_next = pi_beta + config["lr"] * (
            pi_beta_gradient  # + config["kl_coeff"] * kl_beta
        )
        pi_alpha_gradient_history.append(pi_alpha_gradient)
        pi_beta_gradient_history.append(pi_beta_gradient)
        # Clip to the valid probability range [0, 1].
        pi_alpha = max(0.0, min(1.0, pi_alpha_next))
        pi_beta = max(0.0, min(1.0, pi_beta_next))
        pi_alpha_history.append(pi_alpha)
        pi_beta_history.append(pi_beta)
        if converge_step == 0 and convergence_check(
            pi_alpha, pi_beta, config["target_nash"]
        ):
            converge_step = i
            break
    return (
        pi_alpha_history,
        pi_beta_history,
        pi_alpha_gradient_history,
        pi_beta_gradient_history,
        converge_step,
    )
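# `kl` appears only in the commented-out regulariser terms in IGA_PP and is
# not defined in this file. A minimal sketch, assuming each policy is a
# Bernoulli parameter (the probability of playing action 0) and a
# hypothetical clamp `eps` to keep the logs finite:
import math


def kl(p, q, eps=1e-8):
    # KL divergence KL(Bern(p) || Bern(q)), with p, q clamped away from {0, 1}.
    p = min(max(p, eps), 1.0 - eps)
    q = min(max(q, eps), 1.0 - eps)
    return p * math.log(p / q) + (1.0 - p) * math.log((1.0 - p) / (1.0 - q))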
def WoLF_IGA3(pi_alpha, pi_beta, payoff_0, payoff_1, u_alpha, u_beta, config):
    """Win or Learn Fast IGA: a player that is already beating the payoff
    of the target equilibrium strategy ("winning") learns slowly with
    lr_min; otherwise it learns fast with lr_max."""
    pi_alpha_history = [pi_alpha]
    pi_beta_history = [pi_beta]
    pi_alpha_gradient_history = [0.0]
    pi_beta_gradient_history = [0.0]
    converge_step = 0
    for i in range(config["iteration"]):
        lr_alpha = config["lr_max"]
        lr_beta = config["lr_max"]
        # Alpha is winning if its payoff beats what the equilibrium
        # strategy would earn against the current opponent.
        if V(pi_alpha, pi_beta, payoff_0) > V(
            config["target_nash"][0][0], pi_beta, payoff_0
        ):
            lr_alpha = config["lr_min"]
        # Beta's winning test uses beta's own payoff matrix, payoff_1.
        if V(pi_alpha, pi_beta, payoff_1) > V(
            pi_alpha, config["target_nash"][0][1], payoff_1
        ):
            lr_beta = config["lr_min"]
        pi_alpha_gradient = pi_beta * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
        pi_beta_gradient = pi_alpha * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
        pi_alpha_next = pi_alpha + lr_alpha * pi_alpha_gradient
        pi_beta_next = pi_beta + lr_beta * pi_beta_gradient
        # Clip to the valid probability range [0, 1].
        pi_alpha = max(0.0, min(1.0, pi_alpha_next))
        pi_beta = max(0.0, min(1.0, pi_beta_next))
        pi_alpha_gradient_history.append(pi_alpha_gradient)
        pi_beta_gradient_history.append(pi_beta_gradient)
        pi_alpha_history.append(pi_alpha)
        pi_beta_history.append(pi_beta)
        if converge_step == 0 and convergence_check(
            pi_alpha, pi_beta, config["target_nash"]
        ):
            converge_step = i
    return (
        pi_alpha_history,
        pi_beta_history,
        pi_alpha_gradient_history,
        pi_beta_gradient_history,
        converge_step,
    )
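# `V` is not defined in this file. A sketch consistent with the gradient
# expressions used above, assuming each payoff is a dict keyed by
# (row action, column action) and each pi_* is the probability of playing
# action 0:
def V(pi_alpha, pi_beta, payoff):
    # Expected payoff of the joint mixed strategy (pi_alpha, pi_beta).
    return (
        pi_alpha * pi_beta * payoff[(0, 0)]
        + pi_alpha * (1.0 - pi_beta) * payoff[(0, 1)]
        + (1.0 - pi_alpha) * pi_beta * payoff[(1, 0)]
        + (1.0 - pi_alpha) * (1.0 - pi_beta) * payoff[(1, 1)]
    )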
def IGA_TRPO(pi_alpha, pi_beta, payoff_0, payoff_1, u_alpha, u_beta, config):
    """IGA with a KL-regularised (TRPO-style) proposal step, a
    meta-strategy mixing of current and proposed policies, and a final
    best-response gradient step against the mixed policies."""
    pi_alpha_history = []
    pi_beta_history = []
    pi_alpha_gradient_history = []
    pi_beta_gradient_history = []
    meta_strategies = []
    converge_step = 0
    pi_alpha_raw = []
    pi_beta_raw = []
    pi_alpha_no_meta = []
    pi_beta_no_meta = []
    for i in range(config["iteration"]):
        # KL-regularised gradient proposal for both players.
        pi_alpha_1, pi_beta_1, grad_alpha, grad_beta, pi_wo_kl = get_next_policy_trpo(
            pi_alpha,
            pi_beta,
            u_alpha,
            u_beta,
            payoff_0,
            payoff_1,
            config["lr"],
            config["kl_coeff"],
        )
        pi_alpha_raw.append(pi_wo_kl[0])
        pi_beta_raw.append(pi_wo_kl[1])
        # Clip the proposals to the valid probability range [0, 1].
        pi_alpha_1 = max(0.0, min(1.0, pi_alpha_1))
        pi_beta_1 = max(0.0, min(1.0, pi_beta_1))
        pi_alpha_no_meta.append(pi_alpha_1)
        pi_beta_no_meta.append(pi_beta_1)
        # Mix the current and proposed policies via a meta-strategy.
        pi_alpha_next, pi_beta_next, meta_strategy = get_nash_next(
            pi_alpha,
            pi_beta,
            pi_alpha_1,
            pi_beta_1,
            payoff_0,
            payoff_1,
            config["kl_coeff"],
        )
        meta_strategies.append(meta_strategy)
        # Best-response gradient step against the mixed policies.
        BR_pi_alpha_gradient = (
            pi_beta_next * u_alpha + payoff_0[(0, 1)] - payoff_0[(1, 1)]
        )
        BR_pi_beta_gradient = (
            pi_alpha_next * u_beta + payoff_1[(1, 0)] - payoff_1[(1, 1)]
        )
        BR_pi_alpha_next = pi_alpha + config["br_lr"] * BR_pi_alpha_gradient
        BR_pi_beta_next = pi_beta + config["br_lr"] * BR_pi_beta_gradient
        pi_alpha = max(0.0, min(1.0, BR_pi_alpha_next))
        pi_beta = max(0.0, min(1.0, BR_pi_beta_next))
        pi_alpha_gradient_history.append(grad_alpha)
        pi_beta_gradient_history.append(grad_beta)
        pi_alpha_history.append(pi_alpha)
        pi_beta_history.append(pi_beta)
        if converge_step == 0 and convergence_check(
            pi_alpha, pi_beta, config["target_nash"]
        ):
            converge_step = i
            break
    return (
        pi_alpha_history,
        pi_beta_history,
        pi_alpha_gradient_history,
        pi_beta_gradient_history,
        meta_strategies,
        converge_step,
        # pi_alpha_raw,
        # pi_beta_raw,
        # pi_alpha_no_meta,
        # pi_beta_no_meta,
    )
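# Hypothetical driver, assuming 2x2 payoff matrices stored as dicts keyed
# by (row action, column action) and the sketched helpers above: run IGA
# on Matching Pennies, whose unique Nash equilibrium is (0.5, 0.5).
# u_alpha and u_beta are the cross terms of each player's payoff matrix,
# matching the gradient expressions used by all four learners.
if __name__ == "__main__":
    payoff_0 = {(0, 0): 1.0, (0, 1): -1.0, (1, 0): -1.0, (1, 1): 1.0}  # row player
    payoff_1 = {(0, 0): -1.0, (0, 1): 1.0, (1, 0): 1.0, (1, 1): -1.0}  # column player
    u_alpha = payoff_0[(0, 0)] - payoff_0[(0, 1)] - payoff_0[(1, 0)] + payoff_0[(1, 1)]
    u_beta = payoff_1[(0, 0)] - payoff_1[(0, 1)] - payoff_1[(1, 0)] + payoff_1[(1, 1)]
    config = {"iteration": 10000, "lr": 0.01, "target_nash": [(0.5, 0.5)]}
    histories = IGA(0.9, 0.2, payoff_0, payoff_1, u_alpha, u_beta, config)
    print("final policies:", histories[0][-1], histories[1][-1])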