예제 #1
0
def learn(alg, alg_params):
    """Run a short training session of ``alg`` on a one-dimensional LQR.

    Args:
        alg: callable invoked as ``alg(mdp.info, policy, **alg_params)``
            to build the agent.
        alg_params: mapping of keyword arguments forwarded to ``alg``.

    Returns:
        The ``StateStdGaussianPolicy`` instance that was handed to the agent.
    """
    mdp = LQR.generate(dimensions=1)

    # Seed NumPy and PyTorch (CPU and CUDA) with the same value, after the
    # environment is created and before any regressor is built.
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(1)

    obs_shape = mdp.info.observation_space.shape
    act_shape = mdp.info.action_space.shape

    # Both regressors are built from identical settings and share the very
    # same ``params`` dict object.
    regressor_kwargs = dict(
        input_shape=obs_shape,
        output_shape=act_shape,
        params=dict(input_dim=obs_shape),
    )
    mean_regressor = Regressor(LinearApproximator, **regressor_kwargs)
    std_regressor = Regressor(LinearApproximator, **regressor_kwargs)

    # Every weight of the second regressor (passed to the policy as its
    # sigma argument) is set to 2.
    std_regressor.set_weights(2 * np.ones(std_regressor.weights_size))

    policy = StateStdGaussianPolicy(mean_regressor, std_regressor)

    trainer = Core(alg(mdp.info, policy, **alg_params), mdp)
    trainer.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy
예제 #2
0
def test_lqr():
    """Compare LQR step results against stored reference values.

    The first part drives the environment returned by ``LQR.generate(2)``
    with ten seeded random actions and checks the final state. The second
    part builds an LQR from explicit matrices and checks next state, reward
    and absorbing flag after each of four fixed actions.
    """
    np.random.seed(1)

    # Part 1: generated problem, random actions drawn from the seeded RNG.
    env = LQR.generate(2)
    env.reset()
    for _step in range(10):
        random_action = np.random.rand(env.info.action_space.shape[0])
        next_state, reward, absorbing, _ = env.step(random_action)

    assert np.allclose(next_state, np.array([12.35564605, 14.98996889]))

    # Part 2: hand-specified problem matrices, passed positionally.
    A = np.eye(3)
    B = np.array([[2 / 3, 0], [1 / 3, 1 / 3], [0, 2 / 3]])
    Q = np.array([[0.1, 0., 0.], [0., 0.9, 0.], [0., 0., 0.1]])
    R = np.array([[0.1, 0.], [0., 0.9]])
    env = LQR(A, B, Q, R, max_pos=11.0, max_action=0.5, episodic=True)
    env.reset()

    # Each row: (action, expected next state, expected reward,
    # expected absorbing flag). Rows are applied in order to the same
    # episode, so their sequence matters.
    expected_steps = (
        ([1.0, 0.3],
         [10.23333333, 10.16666667, 10.1],
         -107.917,
         False),
        ([0.4, -0.1],
         [10.5, 10.26666667, 10.03333333],
         -113.72311111111117,
         False),
        ([0.5, 0.6],
         [10.83333333, 10.6, 10.36666667],
         -116.20577777777778,
         False),
        ([0.3, -0.7],
         [11.03333333, 10.53333333, 10.03333333],
         -1210.0,
         True),
    )

    for action, want_state, want_reward, want_absorbing in expected_steps:
        next_state, reward, absorbing, _ = env.step(np.array(action))
        assert np.allclose(next_state, np.array(want_state))
        assert np.allclose(reward, want_reward)
        assert bool(absorbing) is want_absorbing