예제 #1
0
def test_reward():
    '''
    Cross-check ExponentialReward.compute_reward against the reference
    Octave implementation (reward.m) on a random state distribution.
    '''
    state_dim = 2
    mean = np.random.rand(1, state_dim)
    cov = np.random.rand(state_dim, state_dim)
    cov = cov.dot(cov.T)  # A A^T is symmetric PSD -> a valid covariance

    reward = ExponentialReward(state_dim)
    weight = reward.W.numpy()
    target = reward.t.numpy()

    # Reference values from the Octave/Matlab implementation.
    expected_mean, _, _, expected_var = octave.reward(
        mean.T, cov, target.T, weight, nout=4)

    actual_mean, actual_var = reward.compute_reward(mean, cov)

    np.testing.assert_allclose(actual_mean, expected_mean)
    np.testing.assert_allclose(actual_var, expected_var)
예제 #2
0
def test_reward():
    '''
    Test the PyTorch ExponentialReward against the reference Octave
    implementation (reward.m) on a random state distribution.

    NOTE(review): requires a CUDA device — inputs are moved with .cuda().
    '''
    k = 2  # state dim
    m = np.random.rand(1, k)          # state mean, shape (1, k)
    s = np.random.randn(k, k)
    s = s.dot(s.T)                    # symmetric PSD -> valid covariance

    reward = ExponentialReward(k)
    # Pull the reward parameters back to host numpy for the Octave call.
    W = reward.W.data.cpu().numpy()
    t = reward.t.data.cpu().numpy()

    M, S = reward.compute_reward(
        torch.tensor(m).float().cuda(),
        torch.tensor(s).float().cuda())

    # Reference values from the Octave/Matlab implementation.
    M_mat, _, _, S_mat = octave.reward(m.T, s, t.T, W, nout=4)

    # Fix: removed a leftover `import pdb; pdb.set_trace()` breakpoint that
    # halted the test under any automated runner.
    np.testing.assert_allclose(M.cpu().numpy(), M_mat)
    np.testing.assert_allclose(S.cpu().numpy(), S_mat)
예제 #3
0
    # Fix the observation-noise variance of every GP so it is not optimized.
    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    r_new = np.zeros((T, 1))  # per-timestep rewards of the latest rollout
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(maxiter=maxiter, restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new, _, _ = rollout(env,
                                     pilco,
                                     timesteps=T_sim,
                                     verbose=True,
                                     SUBS=SUBS,
                                     render=True)

        # Since we had decide on the various parameters of the reward function
        # we might want to verify that it behaves as expected by inspection
        # Fix: index with i — the original `r_new[:, 0] = ...` overwrote the
        # whole column each iteration, making total_r equal T times the
        # reward of the LAST state instead of the sum over the rollout.
        # NOTE(review): assumes len(X_new) <= T (r_new has T rows) — confirm
        # against the rollout length T_sim.
        for i in range(len(X_new)):
            r_new[i, 0] = R.compute_reward(X_new[i, None, :-1],
                                           0.001 * np.eye(state_dim))[0]
        total_r = sum(r_new)
        _, _, r = pilco.predict(X_new[0, None, :-1], 0.001 * S_init, T)
        print("Total ", total_r, " Predicted: ", r)

        # Update dataset
        X = np.vstack((X, X_new))
        Y = np.vstack((Y, Y_new))
        pilco.mgpr.set_data((X, Y))