def log_prob(self, x):
    """ Computes the log-probability of an action $\mathbf u$ under the sticky
    softmax policy, caching first- and second-order derivatives on the instance.

    $$
    \\log p(\\mathbf u|\\mathbf v, \\mathbf u_{t-1}) = \\beta \\mathbf v + \\beta^\\rho \\mathbf u_{t-1} - \\log \\sum_{v_i} e^{\\beta v_i + \\beta^\\rho u_{t-1}^{(i)}}
    $$

    Arguments:

        x: State vector of type `ndarray((nactions,))`

    Returns:

        Scalar log-probability
    """
    beta = self.inverse_softmax_temp
    rho = self.perseveration
    u_last = self.a_last

    # Logits: value-driven drive plus a perseveration (stickiness) term
    self.logits = beta * x + rho * u_last

    # Second-order derivatives of the log-policy
    HB, Hp, HBp, Hx, _ = hess.log_stickysoftmax(beta, rho, x, u_last)
    self.hess_logprob['inverse_softmax_temp'] = HB
    self.hess_logprob['perseveration'] = Hp
    self.hess_logprob['action_values'] = Hx
    self.hess_logprob['inverse_softmax_temp_perseveration'] = HBp

    # First-order derivatives. `lse_grad` is the gradient of the log-sum-exp
    # with respect to the logits.
    lse_grad = grad.logsumexp(self.logits)
    self.d_logprob['logits'] = np.eye(x.size) - lse_grad

    # Partial derivatives with respect to the policy parameters
    self.d_logits['inverse_softmax_temp'] = x
    self.d_logits['perseveration'] = u_last
    self.d_logprob['inverse_softmax_temp'] = x - np.dot(lse_grad, x)
    self.d_logprob['perseveration'] = u_last - np.dot(lse_grad, u_last)

    # Gradient with respect to the action values `x`
    self.d_logprob['action_values'] = (
        np.eye(x.size) * beta - np.tile(beta * lse_grad, [x.size, 1])
    )

    # Log-probabilities; guard against a non-finite normalizer
    normalizer = fu.logsumexp(self.logits)
    if not np.isfinite(normalizer):
        normalizer = 0.
    return self.logits - normalizer
def log_prob(self, x):
    """ Computes the log-probability of an action $\mathbf u$ under the softmax
    policy, while also caching derivatives up to second order on the instance.

    $$
    \\log p(\\mathbf u|\\mathbf v) = \\beta \\mathbf v - \\log \\sum_{v_i} e^{\\beta v_i}
    $$

    Arguments:

        x: State vector of type `ndarray((nstates,))`

    Returns:

        Scalar log-probability
    """
    beta = self.inverse_softmax_temp

    # Logits are the temperature-scaled values
    self.logits = beta * x

    # Second-order derivatives of the log-policy
    HB, Hx = hess.log_softmax(beta, x)
    self.hess_logprob['inverse_softmax_temp'] = HB
    self.hess_logprob['action_values'] = Hx

    # First-order derivatives. `lse_grad` is the gradient of the log-sum-exp
    # with respect to the logits.
    lse_grad = grad.logsumexp(self.logits)
    self.d_logprob['logits'] = np.eye(x.size) - lse_grad

    # Chain rule: d logprob / d beta = (d logprob / d logits) @ (d logits / d beta)
    self.d_logits['inverse_softmax_temp'] = x
    self.d_logprob['inverse_softmax_temp'] = np.dot(
        self.d_logprob['logits'], self.d_logits['inverse_softmax_temp']
    )

    # Gradient with respect to the action values `x`
    self.d_logprob['action_values'] = (
        np.eye(x.size) * beta - np.tile(beta * lse_grad, [x.size, 1])
    )

    # Log-probabilities; guard against a non-finite normalizer
    normalizer = fu.logsumexp(self.logits)
    if not np.isfinite(normalizer):
        normalizer = 0.
    return self.logits - normalizer
def test_logsumexp():
    """Check the analytic log-sum-exp gradient against autograd's numeric one."""
    v = np.array([1., 0., 0.])
    analytic = grad.logsumexp(v)
    reference = gradient(utils.logsumexp)(v)
    # The two gradients should agree to tight numerical tolerance
    assert np.linalg.norm(analytic - reference) < 1e-6
# NOTE(review): this chunk appears to be the opening of a longer trial loop —
# no updates to `Q`, `L`, or `a_last` are visible here, so the body presumably
# continues beyond this view. `X`, `U`, `R`, `X_`, `U_`, `Qmf`, `Qmb`, `B1`,
# `B2`, `persev`, and `env` are all defined elsewhere; verify against caller.
Q = np.zeros((env.nactions, env.nstates))  # state-action value table
L = 0  # accumulated log-likelihood (presumably; updated later — TODO confirm)
a_last = np.zeros(env.nactions)  # previous action, drives the perseveration term
for t in range(R.size):
    # Unpack the trial-t data: state, action, reward, and second-step state/action
    x = X[t]
    u = U[t]
    r = R[t]
    x_ = X_[t]
    u_ = U_[t]
    # Action values at each step: q = Q @ x, q_ = Qmf @ x_
    q = np.einsum('ij,j->i', Q, x)
    q_ = np.einsum('ij,j->i', Qmf, x_)
    # First-step logits include a perseveration (stickiness) term; second step does not
    logits1 = B1 * q + persev * a_last
    logits2 = B2 * q_
    pu1 = fu.softmax(logits1)
    pu2 = fu.softmax(logits2)
    # Jacobians of log-softmax with respect to the logits: I - tiled softmax
    Dlp_logit1 = np.eye(q.size) - np.tile(grad.logsumexp(logits1), [q.size, 1])
    Dlp_logit2 = np.eye(q_.size) - np.tile(grad.logsumexp(logits2), [q_.size, 1])
    # Partial derivatives of the logits with respect to parameters and values
    Dlogit1_q1 = B1
    Dlogit2_q2 = B2
    Dlogit1_B1 = q
    Dlogit2_B2 = q_
    Dlogit1_persev = a_last
    # Gradient of each log-policy with respect to its action values (chain rule)
    Dlp_q1 = B1 * np.eye(q.size) - np.tile(B1 * grad.logsumexp(logits1), [q.size, 1])
    Dlp_q2 = B2 * np.eye(q_.size) - np.tile(B2 * grad.logsumexp(logits2), [q_.size, 1])
    # Derivatives of the action values with respect to the value tables
    Dq1_Q = x
    Dq2_Q = x_
    Dq2_Qmf = x_
    # Derivative of the mixed value table with respect to the mixing weight
    # (presumably Q = w*Qmb + (1-w)*Qmf elsewhere — TODO confirm)
    DQ_w = Qmb - Qmf