def _log_marginal_likelihood(self, thetas, X, y):
    try:
        length_scale, signal_sd, noise_sd, prior_sd = thetas
        noise_sd2 = np.sqrt(noise_sd**2 + self.c * prior_sd**2)
        basis = _basis(X, self.random_matrix, self.bias, self.basis_dim,
                       np.abs(length_scale), np.abs(signal_sd))
        N = len(basis.T)
        XX = np.matmul(basis.T, basis)
        Xy = np.matmul(basis.T, y)

        tmp0 = (noise_sd2 / prior_sd)**2 * np.eye(self.basis_dim) + XX
        Llower = la.cholesky(tmp0, lower=True)
        LinvXy = la.solve_triangular(Llower, Xy, lower=True)
        tmp = np.sum(np.square(LinvXy))

        s, logdet = np.linalg.slogdet(
            np.eye(self.basis_dim) + (prior_sd / noise_sd2)**2 * XX)
        if s != 1:
            print('logdet is <= 0. Returning 10e100.')
            return 10e100

        lml = .5 * (-N * np.log(noise_sd2**2) * self.output_dim
                    - logdet * self.output_dim
                    + (-np.sum(np.square(y)) + tmp) / noise_sd2**2)
        loss = -lml
        return loss
    except Exception as e:
        print('------------')
        print(e, 'Returning 10e100.')
        print('************')
        return 10e100
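# For reference (not part of the original code): up to additive constants, the
# quantity assembled above is the log evidence of Bayesian linear regression in
# the random-feature basis Phi = _basis(X, ...),
#
#   log p(y) ~= -0.5 * [ N * log(sigma_n^2)
#                        + log det(I + (sigma_p / sigma_n)^2 * Phi^T Phi)
#                        + (y^T y - y^T Phi A^{-1} Phi^T y) / sigma_n^2 ],
#
# with A = (sigma_n / sigma_p)^2 * I + Phi^T Phi and sigma_n^2 the inflated
# noise variance noise_sd**2 + self.c * prior_sd**2; the first two terms are
# scaled by self.output_dim as in the code above.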
def update_hyperstate(agent, hyperstate, hyperparameters, datum, dim, learn_diff):
    state, action, reward, next_state, _ = [
        np.atleast_2d(np.copy(dat)) for dat in datum
    ]
    Llowers, Xy = [list(ele) for ele in hyperstate]
    assert len(Llowers) == len(hyperparameters)
    assert len(Xy) == len(hyperparameters)
    assert len(hyperparameters) == dim

    state_action = np.concatenate([state, action], axis=-1)
    y = np.concatenate(
        [next_state - state if learn_diff else next_state, reward],
        axis=-1)[..., :dim]

    # cholupdate is applied to the transposed factors in place, then the
    # factors are transposed back afterwards.
    for i in range(len(Llowers)):
        Llowers[i] = Llowers[i].transpose([0, 2, 1])
    for i, hp in zip(range(dim), hyperparameters):
        length_scale, signal_sd, noise_sd, prior_sd = hp
        basis = _basis(state_action, agent.random_matrices[i], agent.biases[i],
                       agent.basis_dims[i], length_scale, signal_sd)
        cholupdate(Llowers[i][0], basis[0].copy())
        Xy[i] += np.matmul(basis[:, None, :].transpose([0, 2, 1]),
                           y[:, None, :][..., i:i + 1])
    for i in range(len(Llowers)):
        Llowers[i] = Llowers[i].transpose([0, 2, 1])
    return [Llowers, Xy]
def _predict(self, X):
    basis = _basis(X, self.random_matrix, self.bias, self.basis_dim,
                   self.length_scale, self.signal_sd)
    # Solve L z = Phi^T once and reuse it for both the variance and the mean.
    Linv_basisT = la.solve_triangular(self.Llower, basis.T, lower=True)
    predict_sigma = np.sum(np.square(Linv_basisT),
                           axis=0) * self.noise_sd**2 + self.noise_sd**2
    predict_sigma = predict_sigma[..., np.newaxis]
    LinvXy = la.solve_triangular(self.Llower, self.Xy, lower=True)
    predict_mu = np.matmul(Linv_basisT.T, LinvXy)
    return predict_mu, predict_sigma
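# Illustrative helper, not part of the original code: a dense reference form of
# the predictive equations that _predict evaluates through the cached Cholesky
# factor. It forms A^{-1} explicitly, which is slower and less stable, and
# assumes (as in _update below) that Llower is the lower Cholesky factor of
# A = (noise_sd / prior_sd)**2 * I + Phi_train^T Phi_train and
# Xy = Phi_train^T y_train.
def _predict_dense_reference(basis, Llower, Xy, noise_sd):
    A_inv = np.linalg.inv(np.matmul(Llower, Llower.T))
    predict_mu = np.matmul(basis, np.matmul(A_inv, Xy))
    predict_sigma = noise_sd**2 * np.sum(
        np.matmul(basis, A_inv) * basis, axis=-1, keepdims=True) + noise_sd**2
    return predict_mu, predict_sigma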
def _update(self, X, y):
    assert len(X.shape) == 2
    assert len(y.shape) == 2
    assert X.shape[0] == y.shape[0]

    basis = _basis(X, self.random_matrix, self.bias, self.basis_dim,
                   self.length_scale, self.signal_sd)
    self.XX += np.matmul(basis.T, basis)
    self.Xy += np.matmul(basis.T, y)
    # TODO: perform a rank-1 Cholesky update instead of re-factorizing?
    self.Llower = la.cholesky(
        (self.noise_sd / self.prior_sd)**2 * np.eye(self.basis_dim) + self.XX,
        lower=True)
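# A minimal sketch of the rank-1 Cholesky update hinted at in the TODO above:
# given the lower factor L of A, update it in place so it factors A + x x^T,
# avoiding the full O(basis_dim^3) re-factorization per new sample. This is an
# illustrative standalone helper, not the project's `cholupdate` (which, judging
# by the transposes around its call sites, appears to act on the upper factor).
def _rank1_update_lower(L, x):
    x = np.copy(x)
    n = x.shape[0]
    for k in range(n):
        r = np.hypot(L[k, k], x[k])
        c = r / L[k, k]
        s = x[k] / L[k, k]
        L[k, k] = r
        if k + 1 < n:
            L[k + 1:, k] = (L[k + 1:, k] + s * x[k + 1:]) / c
            x[k + 1:] = c * x[k + 1:] - s * L[k + 1:, k]
    return L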
def _reward(self, state, action, sess, Llower, Xy, hyperparameters):
    basis = None
    if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
        reward = self.reward_function.build_np(sess, state, action)
    elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
        reward = self.reward_function.build_np(state, action)
    else:
        state_action = np.concatenate([state, action], axis=-1)
        length_scale, signal_sd, noise_sd, prior_sd = hyperparameters
        basis = _basis(state_action, self.random_matrices[-1], self.biases[-1],
                       self.basis_dims[-1], length_scale, signal_sd)
        basis = np.expand_dims(basis, axis=1)
        pred_mu, pred_sigma = self._predict(Llower, Xy, basis, noise_sd)
        if self.use_mean_reward == 1:
            pred_sigma = np.zeros_like(pred_sigma)
        reward = np.stack([
            np.random.normal(loc=loc, scale=scale)
            for loc, scale in zip(pred_mu, pred_sigma)
        ], axis=0)
    return reward, basis
def _loss(self, thetas, X, Llowers, XXtr, Xytr, A=[], hyperparameters=None,
          sess=None):
    rng_state = np.random.get_state()
    X = np.copy(X)
    Llowers = [np.copy(ele) for ele in Llowers]
    XXtr = [np.copy(ele) for ele in XXtr]
    Xytr = [np.copy(ele) for ele in Xytr]
    hyperparameters = [np.copy(ele) for ele in hyperparameters]
    try:
        np.random.seed(2)

        rewards = []
        state = X
        for unroll_step in range(self.unroll_steps):
            action = self._forward(thetas, state, hyperstate=[Llowers, Xytr])
            reward, basis_reward = self._reward(state, action, sess,
                                                Llowers[-1], Xytr[-1],
                                                hyperparameters[-1])
            rewards.append((self.discount_factor**unroll_step) * reward)
            state_action = np.concatenate([state, action], axis=-1)

            means = []
            covs = []
            bases = []
            for i in range(self.state_dim):
                length_scale, signal_sd, noise_sd, prior_sd = hyperparameters[i]
                basis = _basis(state_action, self.random_matrices[i],
                               self.biases[i], self.basis_dims[i],
                               length_scale, signal_sd)
                basis = np.expand_dims(basis, axis=1)
                bases.append(basis)
                pred_mu, pred_sigma = self._predict(Llowers[i], Xytr[i], basis,
                                                    noise_sd)
                means.append(pred_mu)
                covs.append(pred_sigma)
            means = np.concatenate(means, axis=-1)
            covs = np.concatenate(covs, axis=-1)
            bases.append(basis_reward)

            state_ = np.stack([
                np.random.multivariate_normal(mean=mean, cov=np.diag(cov))
                for mean, cov in zip(means, covs)
            ], axis=0)
            state = state + state_ if self.learn_diff else state_
            if self.learn_diff == 0:
                state_ = np.clip(state_, self.observation_space_low,
                                 self.observation_space_high)
            state = np.clip(state, self.observation_space_low,
                            self.observation_space_high)

            # #Removable
            # import copy
            # Llowers2 = copy.deepcopy(Llowers)
            # Xytr2 = copy.deepcopy(Xytr)
            # XXtr2 = copy.deepcopy(XXtr)
            # #Removable -END-

            if self.update_hyperstate == 1 or self.policy_use_hyperstate == 1:
                y = np.concatenate(
                    [state_, reward],
                    axis=-1)[..., :self.state_dim + self.learn_reward]
                y = y[..., np.newaxis, np.newaxis]
                for i in range(self.state_dim + self.learn_reward):
                    Llowers[i] = Llowers[i].transpose([0, 2, 1])
                for i in range(self.state_dim + self.learn_reward):
                    for j in range(len(Llowers[i])):
                        cholupdate(Llowers[i][j], bases[i][j, 0].copy())
                    Xytr[i] += np.matmul(bases[i].transpose([0, 2, 1]),
                                         y[:, i, ...])
                    # #Removable
                    # _, _, noise_sd, prior_sd = hyperparameters[i]
                    # XXtr2[i], Xytr2[i], Llowers2[i] = self._update_hyperstate(
                    #     XXtr2[i],
                    #     XXtr2[i] + np.matmul(np.transpose(bases[i], [0, 2, 1]), bases[i]),
                    #     Xytr2[i],
                    #     Xytr2[i] + np.matmul(np.transpose(bases[i], [0, 2, 1]), y[:, i, ...]),
                    #     Llowers2[i], (noise_sd / prior_sd)**2)
                    # print i
                    # print np.allclose(Llowers[i], Llowers2[i].transpose([0, 2, 1]))
                    # print np.allclose(Xytr[i], Xytr2[i])
                    # #Removable -END-
                for i in range(self.state_dim + self.learn_reward):
                    Llowers[i] = Llowers[i].transpose([0, 2, 1])

        rewards = np.concatenate(rewards, axis=-1)
        rewards = np.sum(rewards, axis=-1)
        loss = -np.mean(rewards)
        np.random.set_state(rng_state)
        return loss
    except Exception as e:
        np.random.set_state(rng_state)
        print(e, 'Returning 10e100')
        return 10e100