Example No. 1
    def _log_marginal_likelihood(self, thetas, X, y):
        try:
            length_scale, signal_sd, noise_sd, prior_sd = thetas

            noise_sd2 = np.sqrt(noise_sd**2 + self.c * prior_sd**2)

            basis = _basis(X, self.random_matrix, self.bias, self.basis_dim,
                           np.abs(length_scale), np.abs(signal_sd))
            N = len(basis.T)
            XX = np.matmul(basis.T, basis)
            Xy = np.matmul(basis.T, y)

            tmp0 = (noise_sd2 / prior_sd)**2 * np.eye(self.basis_dim) + XX

            Llower = la.cholesky(tmp0, lower=True)
            LinvXy = la.solve_triangular(Llower, Xy, lower=True)
            tmp = np.sum(np.square(LinvXy))

            s, logdet = np.linalg.slogdet(
                np.eye(self.basis_dim) + (prior_sd / noise_sd2)**2 * XX)
            if s != 1:
                print('logdet is <= 0. Returning 10e100.')
                return 10e100

            lml = .5 * (-N * np.log(noise_sd2**2) * self.output_dim -
                        logdet * self.output_dim +
                        (-np.sum(np.square(y)) + tmp) / noise_sd2**2)
            loss = -lml
            return loss
        except Exception as e:
            print('------------')
            print(e, 'Returning 10e100.')
            print('************')
            return 10e100
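
All of these examples call a `_basis` helper that is not shown on this page. Below is a minimal, self-contained sketch of how Example No. 1 could be exercised: `rff_basis` is a hypothetical stand-in for `_basis` (assumed here to compute scaled random Fourier features; the real helper may differ), `neg_log_marginal_likelihood` repeats the algebra of the method above without class state, and the synthetic data, shapes, and optimizer settings are made up for illustration.

import numpy as np
import scipy.linalg as la
from scipy.optimize import minimize

def rff_basis(X, random_matrix, bias, basis_dim, length_scale, signal_sd):
    # Hypothetical stand-in for _basis: scaled random Fourier features.
    return signal_sd * np.sqrt(2.0 / basis_dim) * np.cos(
        np.matmul(X / length_scale, random_matrix) + bias)

def neg_log_marginal_likelihood(thetas, X, y, random_matrix, bias, basis_dim,
                                c=0.0, output_dim=1):
    # Standalone version of the computation in Example No. 1 (same algebra,
    # no class state); returns the same large sentinel on failure.
    try:
        length_scale, signal_sd, noise_sd, prior_sd = thetas
        noise_sd2 = np.sqrt(noise_sd**2 + c * prior_sd**2)
        basis = rff_basis(X, random_matrix, bias, basis_dim,
                          np.abs(length_scale), np.abs(signal_sd))
        N = len(basis.T)
        XX = basis.T @ basis
        Xy = basis.T @ y
        Llower = la.cholesky((noise_sd2 / prior_sd)**2 * np.eye(basis_dim) + XX,
                             lower=True)
        tmp = np.sum(np.square(la.solve_triangular(Llower, Xy, lower=True)))
        s, logdet = np.linalg.slogdet(np.eye(basis_dim) +
                                      (prior_sd / noise_sd2)**2 * XX)
        if s != 1:
            return 10e100
        lml = .5 * (-N * np.log(noise_sd2**2) * output_dim -
                    logdet * output_dim +
                    (-np.sum(np.square(y)) + tmp) / noise_sd2**2)
        return -lml
    except Exception:
        return 10e100

# Toy hyperparameter fit on synthetic 1-D data.
rng = np.random.RandomState(0)
X = rng.uniform(-3.0, 3.0, size=(50, 1))
y = np.sin(X) + 0.1 * rng.randn(50, 1)
basis_dim = 32
random_matrix = rng.randn(1, basis_dim)
bias = rng.uniform(0.0, 2.0 * np.pi, size=basis_dim)
result = minimize(neg_log_marginal_likelihood,
                  x0=np.array([1.0, 1.0, 0.1, 1.0]),
                  args=(X, y, random_matrix, bias, basis_dim),
                  method='Nelder-Mead')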
Example No. 2
def update_hyperstate(agent, hyperstate, hyperparameters, datum, dim,
                      learn_diff):
    state, action, reward, next_state, _ = [
        np.atleast_2d(np.copy(dat)) for dat in datum
    ]
    Llowers, Xy = [list(ele) for ele in hyperstate]
    assert len(Llowers) == len(hyperparameters)
    assert len(Xy) == len(hyperparameters)
    assert len(hyperparameters) == dim
    state_action = np.concatenate([state, action], axis=-1)
    y = np.concatenate(
        [next_state - state if learn_diff else next_state, reward],
        axis=-1)[..., :dim]

    for i in range(len(Llowers)):
        Llowers[i] = Llowers[i].transpose([0, 2, 1])
    for i, hp in zip(range(dim), hyperparameters):
        length_scale, signal_sd, noise_sd, prior_sd = hp
        basis = _basis(state_action, agent.random_matrices[i], agent.biases[i],
                       agent.basis_dims[i], length_scale, signal_sd)
        cholupdate(Llowers[i][0], basis[0].copy())
        Xy[i] += np.matmul(basis[:, None, :].transpose([0, 2, 1]),
                           y[:, None, :][..., i:i + 1])
    for i in range(len(Llowers)):
        Llowers[i] = Llowers[i].transpose([0, 2, 1])

    return [Llowers, Xy]
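
Example No. 2 relies on an in-place `cholupdate`, and the transposes before and after the loop suggest it updates an upper-triangular factor. The real implementation is not shown on this page; `chol_update` below is a hypothetical pure-NumPy stand-in, followed by a toy check of the transpose-update-transpose pattern used above.

import numpy as np
import scipy.linalg as la

def chol_update(R, x):
    # In-place rank-1 update of an upper-triangular factor R so that
    # R.T @ R == R_old.T @ R_old + np.outer(x, x). Hypothetical stand-in
    # for the cholupdate used in the snippet above.
    x = np.copy(x)
    n = x.shape[0]
    for k in range(n):
        r = np.hypot(R[k, k], x[k])
        c, s = r / R[k, k], x[k] / R[k, k]
        R[k, k] = r
        if k + 1 < n:
            R[k, k + 1:] = (R[k, k + 1:] + s * x[k + 1:]) / c
            x[k + 1:] = c * x[k + 1:] - s * R[k, k + 1:]
    return R

# Toy check of the transpose -> update -> transpose pattern used above.
rng = np.random.RandomState(0)
dim = 5
A = 0.5 * np.eye(dim) + np.cov(rng.randn(dim, 20))   # arbitrary SPD system matrix
Llower = la.cholesky(A, lower=True)                  # lower factor, as in the hyperstate
phi = rng.randn(dim)                                 # one new feature row

Llower = Llower.T                                    # to upper
chol_update(Llower, phi)
Llower = Llower.T                                    # back to lower
assert np.allclose(Llower @ Llower.T, A + np.outer(phi, phi))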
Example No. 3
    def _predict(self, X):
        basis = _basis(X, self.random_matrix, self.bias, self.basis_dim,
                       self.length_scale, self.signal_sd)

        # Solve Llower @ Z = basis.T once and reuse the result for both the
        # (diagonal) predictive variance and the predictive mean.
        LinvbasisT = la.solve_triangular(self.Llower, basis.T, lower=True)
        predict_sigma = np.sum(np.square(LinvbasisT),
                               axis=0) * self.noise_sd**2 + self.noise_sd**2
        predict_sigma = predict_sigma[..., np.newaxis]
        LinvXy = la.solve_triangular(self.Llower, self.Xy, lower=True)
        predict_mu = np.matmul(LinvbasisT.T, LinvXy)

        return predict_mu, predict_sigma
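
For reference, the two triangular solves in `_predict` implement the standard Bayesian-linear-regression predictive equations in feature space. The self-contained check below (with made-up shapes and an arbitrary SPD matrix standing in for `(noise_sd / prior_sd)**2 * I + XX`) confirms that the solve-based form matches the direct `inv(A)` form.

import numpy as np
import scipy.linalg as la

# With A = (noise_sd / prior_sd)**2 * I + XX and Llower = cholesky(A, lower=True),
# the solves above compute
#   predict_mu    = basis @ inv(A) @ Xy
#   predict_sigma = noise_sd**2 * diag(basis @ inv(A) @ basis.T) + noise_sd**2
rng = np.random.RandomState(0)
basis_dim, n, noise_sd = 8, 5, 0.1
basis = rng.randn(n, basis_dim)
Xy = rng.randn(basis_dim, 1)
B = rng.randn(basis_dim, basis_dim)
A = 0.25 * np.eye(basis_dim) + B @ B.T        # stands in for (noise/prior)**2 * I + XX
Llower = la.cholesky(A, lower=True)

LinvbasisT = la.solve_triangular(Llower, basis.T, lower=True)
predict_mu = LinvbasisT.T @ la.solve_triangular(Llower, Xy, lower=True)
predict_sigma = np.sum(np.square(LinvbasisT), axis=0) * noise_sd**2 + noise_sd**2

assert np.allclose(predict_mu, basis @ np.linalg.solve(A, Xy))
assert np.allclose(predict_sigma,
                   noise_sd**2 * np.diag(basis @ np.linalg.solve(A, basis.T))
                   + noise_sd**2)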
Example No. 4
    def _update(self, X, y):
        assert len(X.shape) == 2
        assert len(y.shape) == 2
        assert X.shape[0] == y.shape[0]

        basis = _basis(X, self.random_matrix, self.bias, self.basis_dim,
                       self.length_scale, self.signal_sd)
        self.XX += np.matmul(basis.T, basis)
        self.Xy += np.matmul(basis.T, y)

        #TODO: perform a rank-1 cholesky update?
        self.Llower = la.cholesky(
            (self.noise_sd / self.prior_sd)**2 * np.eye(self.basis_dim) +
            self.XX,
            lower=True)
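
On the TODO above: a rank-1 Cholesky update would keep `Llower` in sync without re-factorizing after every batch. The toy check below reuses the hypothetical `chol_update` stand-in sketched after Example No. 2 (an assumption, not the project's own routine) and verifies that one rank-1 update per new basis row reproduces the full factorization.

import numpy as np
import scipy.linalg as la

# chol_update is the hypothetical upper-triangular rank-1 update sketched
# after Example No. 2.
rng = np.random.RandomState(1)
basis_dim, noise_sd, prior_sd = 6, 0.1, 1.0
XX = np.zeros((basis_dim, basis_dim))
R = (noise_sd / prior_sd) * np.eye(basis_dim)      # upper factor of the prior term
for _ in range(3):                                 # three incoming mini-batches
    basis = rng.randn(4, basis_dim)
    XX += basis.T @ basis
    for row in basis:                              # one rank-1 update per row
        chol_update(R, row)
Llower_direct = la.cholesky((noise_sd / prior_sd)**2 * np.eye(basis_dim) + XX,
                            lower=True)
assert np.allclose(R.T, Llower_direct)             # same factor, no re-factorization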
Example No. 5
    def _reward(self, state, action, sess, Llower, Xy, hyperparameters):
        basis = None
        if self.environment == 'Pendulum-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(sess, state, action)
        elif self.environment == 'MountainCarContinuous-v0' and self.learn_reward == 0:
            reward = self.reward_function.build_np(state, action)
        else:
            state_action = np.concatenate([state, action], axis=-1)
            length_scale, signal_sd, noise_sd, prior_sd = hyperparameters
            basis = _basis(state_action, self.random_matrices[-1],
                           self.biases[-1], self.basis_dims[-1], length_scale,
                           signal_sd)
            basis = np.expand_dims(basis, axis=1)
            pred_mu, pred_sigma = self._predict(Llower, Xy, basis, noise_sd)
            if self.use_mean_reward == 1:
                pred_sigma = np.zeros_like(pred_sigma)
            reward = np.stack([
                np.random.normal(loc=loc, scale=scale)
                for loc, scale in zip(pred_mu, pred_sigma)
            ],
                              axis=0)
        return reward, basis
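
A small aside on the sampling loop in Example No. 5: `np.random.normal` broadcasts array-valued `loc` and `scale`, so the per-row comprehension can be collapsed into one vectorized draw, assuming `pred_mu` and `pred_sigma` are arrays of matching shape (the shapes below are made up).

import numpy as np

rng = np.random.RandomState(0)
pred_mu = rng.randn(10, 1)                 # made-up predictive means
pred_sigma = np.full((10, 1), 0.05)        # made-up per-sample scales

# Equivalent in distribution to stacking one np.random.normal draw per row.
reward = np.random.normal(loc=pred_mu, scale=pred_sigma)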
Example No. 6
    def _loss(self,
              thetas,
              X,
              Llowers,
              XXtr,
              Xytr,
              A=[],
              hyperparameters=None,
              sess=None):
        rng_state = np.random.get_state()
        X = np.copy(X)
        Llowers = [np.copy(ele) for ele in Llowers]
        XXtr = [np.copy(ele) for ele in XXtr]
        Xytr = [np.copy(ele) for ele in Xytr]
        hyperparameters = [np.copy(ele) for ele in hyperparameters]
        try:
            np.random.seed(2)

            rewards = []
            state = X
            for unroll_step in range(self.unroll_steps):
                action = self._forward(thetas,
                                       state,
                                       hyperstate=[Llowers, Xytr])
                reward, basis_reward = self._reward(state, action, sess,
                                                    Llowers[-1], Xytr[-1],
                                                    hyperparameters[-1])
                rewards.append((self.discount_factor**unroll_step) * reward)
                state_action = np.concatenate([state, action], axis=-1)

                means = []
                covs = []
                bases = []
                for i in range(self.state_dim):
                    length_scale, signal_sd, noise_sd, prior_sd = hyperparameters[
                        i]
                    basis = _basis(state_action, self.random_matrices[i],
                                   self.biases[i], self.basis_dims[i],
                                   length_scale, signal_sd)
                    basis = np.expand_dims(basis, axis=1)
                    bases.append(basis)
                    pred_mu, pred_sigma = self._predict(
                        Llowers[i], Xytr[i], basis, noise_sd)
                    means.append(pred_mu)
                    covs.append(pred_sigma)
                means = np.concatenate(means, axis=-1)
                covs = np.concatenate(covs, axis=-1)

                bases.append(basis_reward)

                state_ = np.stack([
                    np.random.multivariate_normal(mean=mean, cov=np.diag(cov))
                    for mean, cov in zip(means, covs)
                ],
                                  axis=0)
                state = state + state_ if self.learn_diff else state_
                if self.learn_diff == 0:
                    state_ = np.clip(state_, self.observation_space_low,
                                     self.observation_space_high)
                state = np.clip(state, self.observation_space_low,
                                self.observation_space_high)

                #                #Removable
                #                import copy
                #                Llowers2 = copy.deepcopy(Llowers)
                #                Xytr2 = copy.deepcopy(Xytr)
                #                XXtr2 = copy.deepcopy(XXtr)
                #                #Removable -END-

                if self.update_hyperstate == 1 or self.policy_use_hyperstate == 1:
                    y = np.concatenate([state_, reward],
                                       axis=-1)[..., :self.state_dim +
                                                self.learn_reward]
                    y = y[..., np.newaxis, np.newaxis]
                    for i in range(self.state_dim + self.learn_reward):
                        Llowers[i] = Llowers[i].transpose([0, 2, 1])
                    for i in range(self.state_dim + self.learn_reward):
                        for j in range(len(Llowers[i])):
                            cholupdate(Llowers[i][j], bases[i][j, 0].copy())
                        Xytr[i] += np.matmul(bases[i].transpose([0, 2, 1]),
                                             y[:, i, ...])


#                        #Removable
#                        _, _, noise_sd, prior_sd = hyperparameters[i]
#                        XXtr2[i], Xytr2[i], Llowers2[i] = self._update_hyperstate(XXtr2[i], XXtr2[i] + np.matmul(np.transpose(bases[i], [0, 2, 1]), bases[i]), Xytr2[i], Xytr2[i] + np.matmul(np.transpose(bases[i], [0, 2, 1]), y[:, i, ...]), Llowers2[i], (noise_sd/prior_sd)**2)
#                        print i
#                        print np.allclose(Llowers[i], Llowers2[i].transpose([0, 2, 1]))
#                        print np.allclose(Xytr[i], Xytr2[i])
#                        #Removable -END-

                    for i in range(self.state_dim + self.learn_reward):
                        Llowers[i] = Llowers[i].transpose([0, 2, 1])

            rewards = np.concatenate(rewards, axis=-1)
            rewards = np.sum(rewards, axis=-1)
            loss = -np.mean(rewards)
            np.random.set_state(rng_state)
            return loss
        except Exception as e:
            np.random.set_state(rng_state)
            print(e, 'Returning 10e100')
            return 10e100
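
The RNG bookkeeping in `_loss` (save the global state, reseed with a fixed value, restore the state on every exit path) makes the stochastic rollout deterministic in the policy parameters `thetas`, which is what a gradient-free optimizer needs. Below is a minimal sketch of the same pattern, with a made-up quadratic objective standing in for the rollout.

import numpy as np
from scipy.optimize import minimize

def noisy_objective(thetas):
    # Common-random-numbers pattern: fix the seed inside the objective so the
    # optimizer sees a deterministic function, and restore the caller's RNG
    # state afterwards. The quadratic below stands in for the policy rollout.
    rng_state = np.random.get_state()
    try:
        np.random.seed(2)
        noise = 0.01 * np.random.randn(*np.shape(thetas))
        return np.sum(np.square(thetas - 1.0 + noise))
    finally:
        np.random.set_state(rng_state)

result = minimize(noisy_objective, x0=np.zeros(4), method='Powell')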