import numpy as np

# Helper modules from OpenSpiel's adidas_utils (import paths assumed; any
# modules providing a uniform distribution constructor and a simplex
# tangent-space projection would work equally well).
from open_spiel.python.algorithms.adidas_utils.helpers import misc
from open_spiel.python.algorithms.adidas_utils.helpers import simplex


def gradients(dist, y, payoff_matrices, num_players, p=1, proj_grad=True):
  """Computes exploitability gradient and aux variable gradients.

  Full-information variant for the single-population (symmetric) case.

  Args:
    dist: 1-d np.array, current estimate of nash distribution
    y: 1-d np.array (same shape as dist), current estimate of payoff gradient
    payoff_matrices: (>=2 x A x A) np.array, payoffs for each joint action
    num_players: int, number of players, in case payoff_matrices is abbreviated
    p: float in [0, 1], Tsallis entropy-regularization --> 0 as p --> 0
    proj_grad: bool, if True, projects dist gradient onto simplex

  Returns:
    gradient of exploitability w.r.t. (dist, y) as tuple
    unregularized exploitability
    tsallis regularized exploitability
  """
  nabla = payoff_matrices[0].dot(dist)
  if p > 0:
    power = 1. / float(p)
    s = np.linalg.norm(y, ord=power)
    if s == 0:
      br = misc.uniform_dist(y)
    else:
      br = (y / s)**power
  else:
    # p = 0 recovers the unregularized best response (argmax)
    power = np.inf
    s = np.linalg.norm(y, ord=power)
    br = np.zeros_like(dist)
    maxima = (y == s)
    br[maxima] = 1. / maxima.sum()

  unreg_exp = np.max(y) - y.dot(dist)
  br_inv_sparse = 1 - np.sum(br**(p + 1))
  dist_inv_sparse = 1 - np.sum(dist**(p + 1))
  entr_br = s / (p + 1) * br_inv_sparse
  entr_dist = s / (p + 1) * dist_inv_sparse
  reg_exp = y.dot(br - dist) + entr_br - entr_dist

  entr_br_vec = br_inv_sparse * br**(1 - p)
  entr_dist_vec = dist_inv_sparse * dist**(1 - p)

  policy_gradient = nabla - s * dist**p
  other_player_fx = (br - dist) + 1 / (p + 1) * (entr_br_vec - entr_dist_vec)

  other_player_fx_translated = payoff_matrices[1].dot(other_player_fx)
  grad_dist = -policy_gradient + (num_players - 1) * other_player_fx_translated
  if proj_grad:
    grad_dist = simplex.project_grad(grad_dist)
  grad_y = y - nabla

  return (grad_dist, grad_y), unreg_exp, reg_exp
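
# Usage sketch (illustrative addition, not part of the original source): one
# full-information gradient evaluation on symmetric rock-paper-scissors from
# the uniform distribution. The _demo_* name is hypothetical.
def _demo_gradients():
  rps = np.array([[0., -1., 1.],
                  [1., 0., -1.],
                  [-1., 1., 0.]])
  payoff_matrices = np.stack([rps, rps.T])  # (2 x A x A), symmetric game
  dist = np.ones(3) / 3.                    # uniform initial distribution
  y = payoff_matrices[0].dot(dist)          # warm-start y at exact gradient
  (grad_dist, grad_y), unreg_exp, reg_exp = gradients(
      dist, y, payoff_matrices, num_players=2, p=1, proj_grad=True)
  return grad_dist, grad_y, unreg_exp, reg_exp
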

def cheap_gradients(self, random, dist, y, anneal_steps, payoff_matrices,
                    num_players, p=1, proj_grad=True):
  """Computes exploitability gradient and aux variable gradients with samples.

  Sampled variant for the multi-population (nonsymmetric) case with annealing
  of the Tsallis-entropy parameter p. This implementation takes
  payoff_matrices as input so technically uses O(d^2) compute but only a
  single column of payoff_matrices is used to perform the update so can be
  re-implemented in O(d) if needed.

  Args:
    random: random number generator, np.random.RandomState(seed)
    dist: list of 1-d np.arrays, current estimate of nash distribution
    y: list of 1-d np.arrays (same shape as dist), current est. of payoff grad
    anneal_steps: int, elapsed num steps since last anneal
    payoff_matrices: dictionary with keys as tuples of agents (i, j) and
      values of (2 x A x A) np.arrays, payoffs for each joint action. keys
      are sorted and arrays should be indexed in the same order
    num_players: int, number of players, in case payoff_matrices is abbrev'd
    p: float in [0, 1], Tsallis entropy-regularization --> 0 as p --> 0
    proj_grad: bool, if True, projects dist gradient onto simplex

  Returns:
    gradient of exploitability w.r.t. (dist, y, anneal_steps) as tuple
    unregularized exploitability (stochastic estimate)
    tsallis regularized exploitability (stochastic estimate)
  """
  # first compute policy gradients and player effects (fx)
  policy_gradient = []
  other_player_fx = []
  grad_y = []
  unreg_exp = []
  reg_exp = []
  for i in range(num_players):
    others = list(range(num_players))
    others.remove(i)
    j = random.choice(others)  # use the seeded RNG (was np.random.choice)
    action_j = random.choice(dist[j].size, p=dist[j])
    if i < j:
      hess_i_ij = payoff_matrices[(i, j)][0]
    else:
      hess_i_ij = payoff_matrices[(j, i)][1].T

    nabla_i = hess_i_ij[:, action_j]
    grad_y.append(y[i] - nabla_i)

    if p > 1e-2:  # encounter numerical under/overflow when power > 100.
      power = 1. / float(p)
      s_i = np.linalg.norm(y[i], ord=power)
      if s_i == 0:
        br_i = misc.uniform_dist(y[i])
      else:
        br_i = (y[i] / s_i)**power
    else:
      power = np.inf
      s_i = np.linalg.norm(y[i], ord=power)
      br_i = np.zeros_like(dist[i])
      maxima_i = (y[i] == s_i)
      br_i[maxima_i] = 1. / maxima_i.sum()

    policy_gradient_i = nabla_i - s_i * dist[i]**p
    policy_gradient.append(policy_gradient_i)

    unreg_exp.append(np.max(y[i]) - y[i].dot(dist[i]))

    br_i_inv_sparse = 1 - np.sum(br_i**(p + 1))
    dist_i_inv_sparse = 1 - np.sum(dist[i]**(p + 1))
    entr_br_i = s_i / (p + 1) * br_i_inv_sparse
    entr_dist_i = s_i / (p + 1) * dist_i_inv_sparse

    reg_exp.append(y[i].dot(br_i - dist[i]) + entr_br_i - entr_dist_i)

    entr_br_vec_i = br_i_inv_sparse * br_i**(1 - p)
    entr_dist_vec_i = dist_i_inv_sparse * dist[i]**(1 - p)

    other_player_fx_i = (br_i - dist[i]) + 1 / (p + 1) * (
        entr_br_vec_i - entr_dist_vec_i)
    other_player_fx.append(other_player_fx_i)

  # then construct exploitability gradient
  grad_dist = []
  for i in range(num_players):
    grad_dist_i = -policy_gradient[i]
    for j in range(num_players):
      if j == i:
        continue
      if i < j:
        hess_j_ij = payoff_matrices[(i, j)][1]
      else:
        hess_j_ij = payoff_matrices[(j, i)][0].T

      action_u = random.choice(dist[j].size)  # uniform, ~importance sampling
      other_player_fx_j = dist[j].size * other_player_fx[j][action_u]
      grad_dist_i += hess_j_ij[:, action_u] * other_player_fx_j

    if proj_grad:
      grad_dist_i = simplex.project_grad(grad_dist_i)
    grad_dist.append(grad_dist_i)

  unreg_exp_mean = np.mean(unreg_exp)
  reg_exp_mean = np.mean(reg_exp)

  # anneal: halve p once regularized exploitability falls below threshold
  # and at least 1 / lr_y steps have elapsed since the last anneal
  _, lr_y = self.lrs
  if (reg_exp_mean < self.exp_thresh) and (anneal_steps >= 1 / lr_y):
    self.p = np.clip(p / 2., 0., 1.)
    grad_anneal_steps = -anneal_steps
  else:
    grad_anneal_steps = 1

  return (grad_dist, grad_y, grad_anneal_steps), unreg_exp_mean, reg_exp_mean
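
# Usage sketch (illustrative addition, not part of the original source). The
# enclosing solver class is not shown in this listing, so _StubSolver below is
# a hypothetical stand-in supplying only the attributes the method reads:
# lrs, exp_thresh, and p. The values chosen here are assumptions.
class _StubSolver:

  def __init__(self):
    self.lrs = (1e-2, 1e-1)  # (lr_dist, lr_y); only lr_y is read here
    self.exp_thresh = 1e-3   # halve p once reg_exp falls below this
    self.p = 1.


def _demo_cheap_gradients_anneal():
  solver = _StubSolver()
  rng = np.random.RandomState(0)
  rps = np.array([[0., -1., 1.], [1., 0., -1.], [-1., 1., 0.]])
  # nonsymmetric format: dict keyed by sorted agent pairs, (2 x A x A) values
  payoff_matrices = {(0, 1): np.stack([rps, -rps])}
  dist = [np.ones(3) / 3., np.ones(3) / 3.]
  y = [np.zeros(3), np.zeros(3)]
  (grad_dist, grad_y, grad_anneal), unreg, reg = cheap_gradients(
      solver, rng, dist, y, 0, payoff_matrices, num_players=2, p=solver.p)
  return grad_dist, grad_y, grad_anneal, unreg, reg
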

def cheap_gradients_vr(random, dist, y, payoff_matrices, num_players, pm_vr,
                       p=1, proj_grad=True, version=0):
  """Computes exploitability gradient and aux variable gradients with samples.

  Sampled, variance-reduced variant for the single-population (symmetric)
  case. This implementation takes payoff_matrices as input so technically
  uses O(d^2) compute but only a single column of payoff_matrices is used to
  perform the update so can be re-implemented in O(d) if needed.

  Args:
    random: random number generator, np.random.RandomState(seed)
    dist: 1-d np.array, current estimate of nash distribution
    y: 1-d np.array (same shape as dist), current estimate of payoff gradient
    payoff_matrices: (>=2 x A x A) np.array, payoffs for each joint action
    num_players: int, number of players, in case payoff_matrices is abbreviated
    pm_vr: approximate payoff_matrix for variance reduction
    p: float in [0, 1], Tsallis entropy-regularization --> 0 as p --> 0
    proj_grad: bool, if True, projects dist gradient onto simplex
    version: int, default 0, two options for variance reduction

  Returns:
    gradient of exploitability w.r.t. (dist, y) as tuple
    pm_vr: updated approximate payoff_matrix for variance reduction
    unregularized exploitability (stochastic estimate)
    tsallis regularized exploitability (stochastic estimate)

  Raises:
    ValueError: if pm_vr is None or version is not 0 or 1
  """
  if pm_vr is None:
    raise ValueError(
        "pm_vr must be np.array of shape (num_strats, num_strats)")
  if (not isinstance(version, int)) or (version < 0) or (version > 1):
    raise ValueError("version must be non-negative int < 2")

  action_1 = random.choice(dist.size, p=dist)
  nabla = payoff_matrices[0][:, action_1]
  if p > 0:
    power = 1. / float(p)
    s = np.linalg.norm(y, ord=power)
    if s == 0:
      br = misc.uniform_dist(y)
    else:
      br = (y / s)**power
  else:
    power = np.inf
    s = np.linalg.norm(y, ord=power)
    br = np.zeros_like(dist)
    maxima = (y == s)
    br[maxima] = 1. / maxima.sum()

  unreg_exp = np.max(y) - y.dot(dist)
  br_inv_sparse = 1 - np.sum(br**(p + 1))
  dist_inv_sparse = 1 - np.sum(dist**(p + 1))
  entr_br = s / (p + 1) * br_inv_sparse
  entr_dist = s / (p + 1) * dist_inv_sparse
  reg_exp = y.dot(br - dist) + entr_br - entr_dist

  entr_br_vec = br_inv_sparse * br**(1 - p)
  entr_dist_vec = dist_inv_sparse * dist**(1 - p)

  policy_gradient = nabla - s * dist**p
  other_player_fx = (br - dist) + 1 / (p + 1) * (entr_br_vec - entr_dist_vec)

  if version == 0:
    # control variate: full product with stale pm_vr plus a sampled correction
    other_player_fx_translated = pm_vr.dot(other_player_fx)
    action_u = random.choice(dist.size)  # uniform, ~importance sampling
    other_player_fx = other_player_fx[action_u]  # now a sampled scalar
    pm_mod = dist.size * (payoff_matrices[1, :, action_u] - pm_vr[:, action_u])
    other_player_fx_translated += pm_mod * other_player_fx
  elif version == 1:
    # control variate: stale column sums plus a sampled correction
    other_player_fx_translated = np.sum(pm_vr, axis=1)
    action_u = random.choice(dist.size)  # uniform, ~importance sampling
    other_player_fx = other_player_fx[action_u]  # now a sampled scalar
    pm_mod = dist.size * payoff_matrices[1, :, action_u]
    r = dist.size * pm_vr[:, action_u]
    other_player_fx_translated += pm_mod * other_player_fx - r

  grad_dist = -policy_gradient + (num_players - 1) * other_player_fx_translated
  if proj_grad:
    grad_dist = simplex.project_grad(grad_dist)
  grad_y = y - nabla

  # refresh the sampled column of the variance-reduction estimate
  if version == 0:
    pm_vr[:, action_u] = payoff_matrices[1, :, action_u]
  elif version == 1:
    pm_vr[:, action_u] = payoff_matrices[1, :, action_u] * other_player_fx

  return (grad_dist, grad_y), pm_vr, unreg_exp, reg_exp
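
# Usage sketch (illustrative addition, not part of the original source): a few
# sampled, variance-reduced gradient evaluations. Only the call signature and
# the threading of pm_vr through iterations is shown; dist / y updates that a
# real solver loop would perform are omitted.
def _demo_cheap_gradients_vr():
  rng = np.random.RandomState(0)
  rps = np.array([[0., -1., 1.], [1., 0., -1.], [-1., 1., 0.]])
  payoff_matrices = np.stack([rps, rps.T])
  dist = np.ones(3) / 3.
  y = np.zeros(3)
  pm_vr = np.zeros((3, 3))  # running estimate of payoff_matrices[1]
  for _ in range(10):
    (grad_dist, grad_y), pm_vr, unreg_exp, reg_exp = cheap_gradients_vr(
        rng, dist, y, payoff_matrices, 2, pm_vr, p=1, version=0)
  return pm_vr
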

def cheap_gradients(random, dist, y, payoff_matrices, num_players, p=1,
                    proj_grad=True):
  """Computes exploitability gradient and aux variable gradients with samples.

  Sampled variant for the single-population (symmetric) case. This
  implementation takes payoff_matrices as input so technically uses O(d^2)
  compute but only a single column of payoff_matrices is used to perform the
  update so can be re-implemented in O(d) if needed.

  Args:
    random: random number generator, np.random.RandomState(seed)
    dist: 1-d np.array, current estimate of nash distribution
    y: 1-d np.array (same shape as dist), current estimate of payoff gradient
    payoff_matrices: (>=2 x A x A) np.array, payoffs for each joint action
    num_players: int, number of players, in case payoff_matrices is abbreviated
    p: float in [0, 1], Tsallis entropy-regularization --> 0 as p --> 0
    proj_grad: bool, if True, projects dist gradient onto simplex

  Returns:
    gradient of exploitability w.r.t. (dist, y) as tuple
    unregularized exploitability (stochastic estimate)
    tsallis regularized exploitability (stochastic estimate)
  """
  action_1 = random.choice(dist.size, p=dist)
  nabla = payoff_matrices[0][:, action_1]
  if p > 0:
    power = 1. / float(p)
    s = np.linalg.norm(y, ord=power)
    if s == 0:
      br = misc.uniform_dist(y)
    else:
      br = (y / s)**power
  else:
    power = np.inf
    s = np.linalg.norm(y, ord=power)
    br = np.zeros_like(dist)
    maxima = (y == s)
    br[maxima] = 1. / maxima.sum()

  unreg_exp = np.max(y) - y.dot(dist)
  br_inv_sparse = 1 - np.sum(br**(p + 1))
  dist_inv_sparse = 1 - np.sum(dist**(p + 1))
  entr_br = s / (p + 1) * br_inv_sparse
  entr_dist = s / (p + 1) * dist_inv_sparse
  reg_exp = y.dot(br - dist) + entr_br - entr_dist

  entr_br_vec = br_inv_sparse * br**(1 - p)
  entr_dist_vec = dist_inv_sparse * dist**(1 - p)

  policy_gradient = nabla - s * dist**p
  other_player_fx = (br - dist) + 1 / (p + 1) * (entr_br_vec - entr_dist_vec)

  action_u = random.choice(dist.size)  # uniform, ~importance sampling
  other_player_fx = dist.size * other_player_fx[action_u]
  other_player_fx_translated = payoff_matrices[1, :, action_u] * other_player_fx
  grad_dist = -policy_gradient + (num_players - 1) * other_player_fx_translated
  if proj_grad:
    grad_dist = simplex.project_grad(grad_dist)
  grad_y = y - nabla

  return (grad_dist, grad_y), unreg_exp, reg_exp
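
# Usage sketch (illustrative addition, not part of the original source): one
# sampled gradient evaluation; the RNG is a np.random.RandomState as the
# docstring specifies, so repeated runs with the same seed are reproducible.
def _demo_cheap_gradients():
  rng = np.random.RandomState(0)
  rps = np.array([[0., -1., 1.], [1., 0., -1.], [-1., 1., 0.]])
  payoff_matrices = np.stack([rps, rps.T])
  dist = np.ones(3) / 3.
  y = payoff_matrices[0].dot(dist)
  (grad_dist, grad_y), unreg_exp, reg_exp = cheap_gradients(
      rng, dist, y, payoff_matrices, num_players=2, p=1)
  return grad_dist, grad_y, unreg_exp, reg_exp
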

def gradients(dist, y, payoff_matrices, num_players, p=1, proj_grad=True):
  """Computes exploitability gradient and aux variable gradients.

  Full-information variant for the multi-population (nonsymmetric) case.

  Args:
    dist: list of 1-d np.arrays, current estimate of nash distribution
    y: list of 1-d np.arrays (same shape as dist), current est. of payoff
      gradient
    payoff_matrices: dictionary with keys as tuples of agents (i, j) and
      values of (2 x A x A) np.arrays, payoffs for each joint action. keys
      are sorted and arrays should be indexed in the same order
    num_players: int, number of players, in case payoff_matrices is abbreviated
    p: float in [0, 1], Tsallis entropy-regularization --> 0 as p --> 0
    proj_grad: bool, if True, projects dist gradient onto simplex

  Returns:
    gradient of exploitability w.r.t. (dist, y) as tuple
    unregularized exploitability
    tsallis regularized exploitability
  """
  # first compute policy gradients and player effects (fx)
  policy_gradient = []
  other_player_fx = []
  grad_y = []
  unreg_exp = []
  reg_exp = []
  for i in range(num_players):
    nabla_i = np.zeros_like(dist[i])
    for j in range(num_players):
      if j == i:
        continue
      if i < j:
        hess_i_ij = payoff_matrices[(i, j)][0]
      else:
        hess_i_ij = payoff_matrices[(j, i)][1].T

      nabla_ij = hess_i_ij.dot(dist[j])
      nabla_i += nabla_ij / float(num_players - 1)

    grad_y.append(y[i] - nabla_i)

    if p > 0:
      power = 1. / float(p)
      s_i = np.linalg.norm(y[i], ord=power)
      if s_i == 0:
        br_i = misc.uniform_dist(y[i])
      else:
        br_i = (y[i] / s_i)**power
    else:
      power = np.inf
      s_i = np.linalg.norm(y[i], ord=power)
      br_i = np.zeros_like(dist[i])
      maxima_i = (y[i] == s_i)
      br_i[maxima_i] = 1. / maxima_i.sum()

    policy_gradient_i = nabla_i - s_i * dist[i]**p
    policy_gradient.append(policy_gradient_i)

    unreg_exp.append(np.max(y[i]) - y[i].dot(dist[i]))

    br_i_inv_sparse = 1 - np.sum(br_i**(p + 1))
    dist_i_inv_sparse = 1 - np.sum(dist[i]**(p + 1))
    entr_br_i = s_i / (p + 1) * br_i_inv_sparse
    entr_dist_i = s_i / (p + 1) * dist_i_inv_sparse

    reg_exp.append(y[i].dot(br_i - dist[i]) + entr_br_i - entr_dist_i)

    entr_br_vec_i = br_i_inv_sparse * br_i**(1 - p)
    entr_dist_vec_i = dist_i_inv_sparse * dist[i]**(1 - p)

    other_player_fx_i = (
        br_i - dist[i]) + 1 / (p + 1) * (entr_br_vec_i - entr_dist_vec_i)
    other_player_fx.append(other_player_fx_i)

  # then construct exploitability gradient
  grad_dist = []
  for i in range(num_players):
    grad_dist_i = -policy_gradient[i]
    for j in range(num_players):
      if j == i:
        continue
      if i < j:
        hess_j_ij = payoff_matrices[(i, j)][1]
      else:
        hess_j_ij = payoff_matrices[(j, i)][0].T

      grad_dist_i += hess_j_ij.dot(other_player_fx[j])

    if proj_grad:
      grad_dist_i = simplex.project_grad(grad_dist_i)
    grad_dist.append(grad_dist_i)

  return (grad_dist, grad_y), np.mean(unreg_exp), np.mean(reg_exp)
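
# Usage sketch (illustrative addition, not part of the original source):
# full-information gradients for a two-player zero-sum game in the
# nonsymmetric format, calling the gradients function defined just above.
def _demo_gradients_nonsymmetric():
  rps = np.array([[0., -1., 1.], [1., 0., -1.], [-1., 1., 0.]])
  payoff_matrices = {(0, 1): np.stack([rps, -rps])}  # player 0 and 1 payoffs
  dist = [np.ones(3) / 3., np.ones(3) / 3.]
  y = [np.zeros(3), np.zeros(3)]  # warm start; exact gradients are zero here
  (grad_dist, grad_y), unreg_exp, reg_exp = gradients(
      dist, y, payoff_matrices, num_players=2, p=1)
  return grad_dist, grad_y, unreg_exp, reg_exp
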

def gradients(self, dist, y, anneal_steps, payoff_matrices, num_players, p=1,
              proj_grad=True):
  """Computes exploitability gradient and aux variable gradients.

  Full-information variant for the single-population (symmetric) case with
  annealing of the Tsallis-entropy parameter p.

  Args:
    dist: 1-d np.array, current estimate of nash distribution
    y: 1-d np.array (same shape as dist), current estimate of payoff gradient
    anneal_steps: int, elapsed num steps since last anneal
    payoff_matrices: (>=2 x A x A) np.array, payoffs for each joint action
    num_players: int, number of players, in case payoff_matrices is abbreviated
    p: float in [0, 1], Tsallis entropy-regularization --> 0 as p --> 0
    proj_grad: bool, if True, projects dist gradient onto simplex

  Returns:
    gradient of exploitability w.r.t. (dist, y, anneal_steps) as tuple
    unregularized exploitability
    tsallis regularized exploitability
  """
  nabla = payoff_matrices[0].dot(dist)
  if p > 1e-2:  # encounter numerical under/overflow when power > 100.
    power = 1. / float(p)
    s = np.linalg.norm(y, ord=power)
    if s == 0:
      br = misc.uniform_dist(y)
    else:
      br = (y / s)**power
  else:
    power = np.inf
    s = np.linalg.norm(y, ord=power)
    br = np.zeros_like(dist)
    maxima = (y == s)
    br[maxima] = 1. / maxima.sum()

  unreg_exp = np.max(y) - y.dot(dist)
  br_inv_sparse = 1 - np.sum(br**(p + 1))
  dist_inv_sparse = 1 - np.sum(dist**(p + 1))
  entr_br = s / (p + 1) * br_inv_sparse
  entr_dist = s / (p + 1) * dist_inv_sparse
  reg_exp = y.dot(br - dist) + entr_br - entr_dist

  entr_br_vec = br_inv_sparse * br**(1 - p)
  entr_dist_vec = dist_inv_sparse * dist**(1 - p)

  policy_gradient = nabla - s * dist**p
  other_player_fx = (br - dist) + 1 / (p + 1) * (entr_br_vec - entr_dist_vec)

  other_player_fx_translated = payoff_matrices[1].dot(other_player_fx)
  grad_dist = -policy_gradient
  grad_dist += (num_players - 1) * other_player_fx_translated
  if proj_grad:
    grad_dist = simplex.project_grad(grad_dist)
  grad_y = y - nabla

  # anneal: halve p once regularized exploitability falls below threshold
  # and at least 1 / lr_y steps have elapsed since the last anneal
  _, lr_y = self.lrs
  if (reg_exp < self.exp_thresh) and (anneal_steps >= 1 / lr_y):
    self.p = np.clip(p / 2., 0., 1.)
    grad_anneal_steps = -anneal_steps
  else:
    grad_anneal_steps = 1

  return (grad_dist, grad_y, grad_anneal_steps), unreg_exp, reg_exp
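
# Usage sketch (illustrative addition, not part of the original source);
# _StubSolver is the same hypothetical stand-in defined after the annealing
# cheap_gradients sketch above, and the call targets the method just above.
def _demo_gradients_anneal():
  solver = _StubSolver()
  rps = np.array([[0., -1., 1.], [1., 0., -1.], [-1., 1., 0.]])
  payoff_matrices = np.stack([rps, rps.T])
  dist = np.ones(3) / 3.
  y = payoff_matrices[0].dot(dist)
  (grad_dist, grad_y, grad_anneal), unreg_exp, reg_exp = gradients(
      solver, dist, y, 0, payoff_matrices, num_players=2, p=solver.p)
  return grad_dist, grad_y, grad_anneal, unreg_exp, reg_exp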