def log_prob(w, D):
    # Fit on an unconstrained scale: w[0] and w[1] are mapped into their
    # natural ranges with sigmoid/stable_exp, so the returned gradient is
    # pushed back through the transform with the Jacobian J (chain rule).
    agent = RWSoftmaxAgent(task=MyBanditTask(),
                           learning_rate=sigmoid(w[0]),
                           inverse_softmax_temp=stable_exp(w[1]))
    for t in range(D.shape[0]):
        x  = D[t, :7]     # state (one-hot)
        u  = D[t, 7:11]   # action (one-hot)
        r  = D[t, 11]     # reward
        x_ = D[t, 12:]    # next state
        agent.log_prob(x, u)
        agent.learning(x, u, r, x_, None)
    J = np.diag([grad.sigmoid(w[0]), grad.exp(w[1])])
    return -agent.logprob_, -J @ agent.grad_
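
# Because the function above returns a (negative log-likelihood, gradient)
# pair, it can be handed directly to a gradient-based optimizer. A minimal
# sketch, assuming D is one subject's trial-by-trial data array laid out as
# above; the starting point and the choice of L-BFGS-B are illustrative:
from scipy.optimize import minimize

res = minimize(log_prob, x0=np.zeros(2), args=(D,), jac=True, method='L-BFGS-B')
lr_hat  = sigmoid(res.x[0])       # estimates mapped back to natural scales
ist_hat = stable_exp(res.x[1])
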
def log_prob(w, D):
    # Same model, but w is passed through untransformed and the
    # log-likelihood is accumulated by hand: u is one-hot, so the dot
    # product picks out the log-probability of the chosen action.
    agent = RWSoftmaxAgent(task=MyBanditTask(),
                           learning_rate=w[0],
                           inverse_softmax_temp=w[1])
    L = 0
    for t in range(D.shape[0]):
        x  = D[t, :7]
        u  = D[t, 7:11]
        r  = D[t, 11]
        x_ = D[t, 12:]
        L += u @ agent.log_prob(x)
        agent.learning(x, u, r, x_, None)
    return L
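
# This variant returns the raw log-likelihood with no derivatives, so a
# derivative-free optimizer is one option. Note that without the sigmoid/exp
# reparameterization used in the other variants, the search runs in the
# natural parameter space and can propose invalid values (e.g. a learning
# rate outside [0, 1]). A minimal sketch with an assumed data array D and
# an illustrative starting point:
from scipy.optimize import minimize

res = minimize(lambda w: -log_prob(w, D),   # maximize L by minimizing -L
               x0=np.array([0.1, 1.0]),
               method='Nelder-Mead')
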
def log_prob(w, D):
    lr = sigmoid(w[0], a_min=-6, a_max=6)
    ist = relu(w[1], a_max=10)
    agent = RWSoftmaxAgent(TwoArmedBandit(), lr, ist)
    L = 0
    for t in range(D.shape[0]):
        x  = D[t, :3]
        u  = D[t, 3:5]
        r  = D[t, 5]
        x_ = D[t, 6:]
        L += u @ agent.log_prob(x)
        agent.learning(x, u, r, x_, None)
    return L
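
# The clipping keywords above guard against numerical overflow at extreme
# candidate values. Assuming a_min/a_max clip the argument before the
# transform (as the keyword names suggest), the effect is roughly:
import numpy as np

w0  = 50.0                                        # extreme unconstrained value
lr  = 1.0 / (1.0 + np.exp(-np.clip(w0, -6, 6)))   # ~0.9975, bounded away from 1
ist = np.minimum(np.maximum(w0, 0.0), 10.0)       # relu capped at a_max = 10
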
def log_prob(w, D):
    # As above, but using fitr's internal derivative accumulation:
    # log_prob(x, u) updates agent.logprob_ and agent.grad_ in place, and
    # the stored gradient is mapped back to the unconstrained parameters
    # elementwise (J holds the diagonal of the reparameterization Jacobian).
    lr = sigmoid(w[0], a_min=-6, a_max=6)
    ist = stable_exp(w[1], a_min=-10, a_max=10)
    agent = RWSoftmaxAgent(TwoArmedBandit(), lr, ist)
    for t in range(D.shape[0]):
        x  = D[t, :3]
        u  = D[t, 3:5]
        r  = D[t, 5]
        x_ = D[t, 6:]
        agent.log_prob(x, u)
        agent.learning(x, u, r, x_, None)
    J = np.array([grad.sigmoid(w[0]), grad.exp(w[1])])
    return -agent.logprob_, -J * agent.grad_
def rwsoftmax_loglik(w, D):
    # Unpack the trial matrix: nx = state dimension, nu = number of actions.
    X1 = D[:, :nx]
    U1 = D[:, nx:nx + nu]
    R  = D[:, nx + nu]
    X2 = D[:, nx + nu + 1:nx + nu + 1 + nx]
    # Transform to the natural parameter space and get the Jacobian of the
    # reparameterization, used to map the derivatives below.
    w = fu.transform(w, [fu.sigmoid, np.exp]).flatten()
    J = reparam_jac_rwsm(w)
    q = RWSoftmaxAgent(task=task(),
                       learning_rate=w[0],
                       inverse_softmax_temp=w[1])
    ntrials = X1.shape[0]
    for t in range(ntrials):
        q.log_prob(X1[t], U1[t])
        q.learning(X1[t], U1[t], R[t], X2[t], None)
    L = q.logprob_
    return -L, -J @ q.grad_, -J.T @ q.hess_ @ J
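
# reparam_jac_rwsm is not defined in this section. A minimal sketch
# consistent with its use above, where it is called on the *transformed*
# parameters: for s = sigmoid(x), ds/dx = s(1 - s), and for e = exp(x),
# de/dx = e.
def reparam_jac_rwsm(w):
    return np.diag([w[0] * (1.0 - w[0]), w[1]])
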
def test_rwsoftmaxagent():
    lr = 0.1
    B = 1.5
    task = TwoArmedBandit()
    q = RWSoftmaxAgent(task, learning_rate=lr, inverse_softmax_temp=B)
    x   = np.array([1., 0., 0.])
    u1  = np.array([1., 0.])
    u2  = np.array([0., 1.])
    x_1 = np.array([0., 1., 0.])
    x_2 = np.array([0., 0., 1.])
    r1 = 1.0
    r2 = 0.0

    # Run a short hand-built trial sequence, accumulating fitr's analytical
    # derivatives as we go.
    q.log_prob(x, u1)
    q.learning(x, u1, r1, x_1, None)
    q.log_prob(x, u2)
    q.learning(x, u2, r2, x_2, None)
    q.log_prob(x, u2)
    q.learning(x, u2, r1, x_1, None)
    q.log_prob(x, u1)
    q.learning(x, u1, r2, x_2, None)
    q.log_prob(x, u1)
    q.learning(x, u1, r1, x_1, None)
    fitr_grad = q.grad_
    fitr_hess = q.hess_

    # Reference: the same sequence through the derivative-free code path,
    # differentiated with autograd.
    def f(w):
        m = RWSoftmaxAgent(task, learning_rate=w[0], inverse_softmax_temp=w[1])
        m._log_prob_noderivatives(x, u1)
        m.critic._update_noderivatives(x, u1, r1, x_1, None)
        m._log_prob_noderivatives(x, u2)
        m.critic._update_noderivatives(x, u2, r2, x_2, None)
        m._log_prob_noderivatives(x, u2)
        m.critic._update_noderivatives(x, u2, r1, x_1, None)
        m._log_prob_noderivatives(x, u1)
        m.critic._update_noderivatives(x, u1, r2, x_2, None)
        m._log_prob_noderivatives(x, u1)
        m.critic._update_noderivatives(x, u1, r1, x_1, None)
        return m.logprob_

    agJ = jacobian(f)(np.array([lr, B]))
    agH = hessian(f)(np.array([lr, B]))
    # Analytical and autograd derivatives should agree to numerical precision.
    assert np.allclose(agJ, fitr_grad)
    assert np.allclose(agH, fitr_hess)
def f():
    # Closure over a fixed dataset (X, U, R, X_) and task from the
    # enclosing scope; evaluates the log-likelihood at fixed parameters.
    agent = RWSoftmaxAgent(task, learning_rate=0.4, inverse_softmax_temp=2.6)
    for t in range(X.shape[0]):
        agent.log_prob(X[t], U[t])
        agent.learning(X[t], U[t], R[t], X_[t], None)
    return agent.logprob_
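
# Since f takes no arguments and closes over fixed data and parameters, one
# plausible use is benchmarking a single likelihood-plus-derivatives pass;
# a minimal sketch:
import timeit

print(timeit.timeit(f, number=10))   # wall time for 10 evaluations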