import numpy as np
from oct2py import octave

from pilco.rewards import ExponentialReward


def test_reward():
    ''' Test reward function by comparing to reward.m '''
    k = 2  # state dim
    m = np.random.rand(1, k)   # mean of the input state distribution
    s = np.random.rand(k, k)
    s = s.dot(s.T)             # make s a valid (positive semi-definite) covariance
    reward = ExponentialReward(k)
    W = reward.W.numpy()
    t = reward.t.numpy()
    M, S = reward.compute_reward(m, s)
    M_mat, _, _, S_mat = octave.reward(m.T, s, t.T, W, nout=4)
    np.testing.assert_allclose(M, M_mat)
    np.testing.assert_allclose(S, S_mat)
# PyTorch/CUDA variant of the same test; additionally requires torch and a
# PyTorch ExponentialReward implementation with the same interface.
import torch


def test_reward():
    ''' Test reward function by comparing to reward.m '''
    k = 2  # state dim
    m = np.random.rand(1, k)
    s = np.random.randn(k, k)
    s = s.dot(s.T)             # make s a valid (positive semi-definite) covariance
    reward = ExponentialReward(k)
    W = reward.W.data.cpu().numpy()
    t = reward.t.data.cpu().numpy()
    M, S = reward.compute_reward(
        torch.tensor(m).float().cuda(),
        torch.tensor(s).float().cuda())
    M_mat, _, _, S_mat = octave.reward(m.T, s, t.T, W, nout=4)
    np.testing.assert_allclose(M.cpu().numpy(), M_mat)
    np.testing.assert_allclose(S.cpu().numpy(), S_mat)
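# Both test variants call the MATLAB baseline reward.m through oct2py, so the
# Octave bridge must be able to find that file before either test runs. A
# minimal sketch of the assumed setup -- the path below is a hypothetical
# placeholder for wherever the .m files live in your checkout:
from oct2py import octave

octave.addpath('tests/matlab_code')  # hypothetical path containing reward.m
np.random.seed(0)  # optional: fix the random test inputs for reproducibility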
for model in pilco.mgpr.models:
    model.likelihood.variance.assign(0.001)
    set_trainable(model.likelihood.variance, False)

r_new = np.zeros((T, 1))
for rollouts in range(N):
    print("**** ITERATION no", rollouts, " ****")
    pilco.optimize_models(maxiter=maxiter, restarts=2)
    pilco.optimize_policy(maxiter=maxiter, restarts=2)

    X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T_sim, verbose=True,
                                 SUBS=SUBS, render=True)

    # Since we had to decide on the various parameters of the reward function,
    # we might want to verify that it behaves as expected by inspection.
    for i in range(len(X_new)):
        r_new[i, 0] = R.compute_reward(X_new[i, None, :-1],
                                       0.001 * np.eye(state_dim))[0]
    total_r = sum(r_new)
    _, _, r = pilco.predict(X_new[0, None, :-1], 0.001 * S_init, T)
    print("Total ", total_r, " Predicted: ", r)

    # Update dataset
    X = np.vstack((X, X_new))
    Y = np.vstack((Y, Y_new))
    pilco.mgpr.set_data((X, Y))
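# The training loop above relies on a rollout helper that is not shown in this
# section. Below is a minimal sketch of such a helper under assumed
# conventions: GP inputs are (state, action) pairs, GP targets are state
# differences, actions come from pilco.compute_action, and the environment
# follows the old gym step API. The exact helper in the examples may differ.
def rollout(env, pilco, timesteps, verbose=False, SUBS=1, render=False):
    ''' Run one episode and collect (state, action) -> state-delta data. '''
    X, Y = [], []
    x = env.reset()
    ep_return = 0.0
    for _ in range(timesteps):
        if render:
            env.render()
        # Mean action of the current policy at state x (assumed interface).
        u = pilco.compute_action(x[None, :])[0, :]
        # Hold the action for SUBS consecutive simulator steps (frame skip).
        for _ in range(SUBS):
            x_new, r, done, _ = env.step(u)
            ep_return += r
            if done:
                break
        if verbose:
            print("Action:", u, " State:", x_new, " Return:", ep_return)
        X.append(np.hstack((x, u)))  # GP input: state and action concatenated
        Y.append(x_new - x)          # GP target: state difference
        x = x_new
        if done:
            break
    # Return the episode return twice so the caller's four-way unpacking
    # works; the real helper may track sampled and full returns separately.
    return np.stack(X), np.stack(Y), ep_return, ep_return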