Example #1
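        # Tail of a stopping-criterion callback: evaluate the current policy
        # from the fixed initial states and stop once the evaluated return
        # exceeds -67.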
        values = evaluation.evaluate_policy(mdp, agent,
                                            initial_states=initial_states)
        stop = values[0] > -67.
        return stop
    else:
        return False


### PBO ##################################
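# Gradient-based PBO learner: rho_regressor parameterizes the learned Bellman
# operator, q_regressor the Q-function it acts on; the remaining arguments are
# taken from the experiment's configuration constants.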
pbo = GradPBO(bellman_model=rho_regressor,
              q_model=q_regressor,
              steps_ahead=STEPS_AHEAD,
              discrete_actions=discrete_actions,
              gamma=mdp.gamma,
              optimizer="adam",
              state_dim=state_dim,
              action_dim=action_dim,
              incremental=INCREMENTAL,
              update_theta_every=UPDATE_EVERY,
              steps_per_theta_update=None,
              verbose=1,
              norm_value=NORM_VALUE,
              independent=INDEPENDENT)
              # term_condition=lambda v1, v2:
              # increment_base_termination(v1,v2,2,tol=1e-2))
              #term_condition=lambda v1, v2: terminal_evaluation(v1,v2,1e-1))


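# Metric helper: apply the learned Bellman operator STEPS_AHEAD times to the
# current Q-parameters and report the result of q_regressor.get_k on them.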
def tmetric(theta):
    t = pbo.apply_bo(theta[0], n_times=STEPS_AHEAD)
    return q_regressor.get_k(t)
Example #2
        values = evaluation.evaluate_policy(mdp,
                                            agent,
                                            initial_states=initial_states)
        stop = values[0] > -67.
        return stop
    else:
        return False


### PBO ##################################
pbo = GradPBO(bellman_model=rho_regressor,
              q_model=q_regressor,
              steps_ahead=STEPS_AHEAD,
              discrete_actions=discrete_actions,
              gamma=mdp.gamma,
              optimizer="adam",
              state_dim=state_dim,
              action_dim=action_dim,
              incremental=INCREMENTAL,
              update_theta_every=UPDATE_EVERY,
              steps_per_theta_update=None,
              verbose=1,
              norm_value=NORM_VALUE,
              independent=INDEPENDENT)
# term_condition=lambda v1, v2:
# increment_base_termination(v1,v2,2,tol=1e-2))
#term_condition=lambda v1, v2: terminal_evaluation(v1,v2,1e-1))


def tmetric(theta):
    t = pbo.apply_bo(theta[0], n_times=STEPS_AHEAD)
    return q_regressor.get_k(t)
Example #3
q_model = LQRRegressor()  # q-function

s = np.array([1., 2., 3.]).reshape(-1, 1)
a = np.array([0., 3., 4.]).reshape(-1, 1)
nexts = s + 1
r = np.array([-1., -5., 0.])
absorbing = np.array([0., 0., 0.])
discrete_actions = np.array([1, 2, 3]).reshape(-1, 1)
# to be used for maximum estimate

# =================================================================
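# One-step (ST = 1), non-incremental GradPBO instance with an L2-norm
# objective, built to exercise the compiled helper functions checked below.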
INCREMENTAL = False
NORM_VAL = 2
ST = 1
gpbo = GradPBO(bellman_model=lbpo,
               q_model=q_model,
               steps_ahead=ST,
               discrete_actions=discrete_actions,
               gamma=gamma,
               optimizer="adam",
               norm_value=NORM_VAL,
               state_dim=1,
               action_dim=1,
               incremental=INCREMENTAL)
gpbo._make_additional_functions()
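# Sanity checks: the compiled symbolic functions must reproduce the NumPy
# reference implementations (bellmanop, lqr_reg) on the toy batch above.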
assert np.allclose(bellmanop(rho, theta), gpbo.F_bellman_operator(theta)), \
    '{}, {}'.format(bellmanop(rho, theta), gpbo.F_bellman_operator(theta))
assert np.allclose(lqr_reg(s, a, theta), gpbo.F_q(s, a, theta))

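# berr: compiled Bellman error on the batch; tv: the same quantity recomputed
# with the NumPy reference multi_step_ebop for comparison.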
berr = gpbo.F_bellman_err(s, a, nexts, r, absorbing, theta, discrete_actions)
tv = multi_step_ebop(s, a, r, nexts, absorbing,
                     discrete_actions, gamma, rho, theta,
                     norm_value=NORM_VAL, incremental=INCREMENTAL,
                     steps=ST)[0]
Example #4
q_model = LQRRegressor()  # q-function

s = np.array([1., 2., 3.]).reshape(-1, 1)
a = np.array([0., 3., 4.]).reshape(-1, 1)
nexts = s + 1
r = np.array([-1., -5., 0.])
absorbing = np.array([0., 0., 0.])
discrete_actions = np.array([1, 2, 3]).reshape(-1, 1)
# to be used for maximum estimate

# =================================================================
INCREMENTAL = False
NORM_VAL = 2
ST = 1
gpbo = GradPBO(bellman_model=lbpo, q_model=q_model, steps_ahead=ST,
               discrete_actions=discrete_actions,
               gamma=gamma, optimizer="adam", norm_value=NORM_VAL,
               state_dim=1, action_dim=1, incremental=INCREMENTAL)
gpbo._make_additional_functions()
assert np.allclose(bellmanop(rho, theta), gpbo.F_bellman_operator(theta)), \
    '{}, {}'.format(bellmanop(rho, theta), gpbo.F_bellman_operator(theta))
assert np.allclose(lqr_reg(s, a, theta), gpbo.F_q(s, a, theta))

berr = gpbo.F_bellman_err(s, a, nexts, r, absorbing, theta, discrete_actions)
tv = multi_step_ebop(s, a, r, nexts, absorbing,
                     discrete_actions, gamma, rho, theta,
                     norm_value=NORM_VAL, incremental=INCREMENTAL, steps=ST)[0]
assert np.allclose(berr, tv), '{}, {}'.format(berr, tv)
print(tv)

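# Compiled gradient of the Bellman error on the same batch.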
berr_grad = gpbo.F_grad_bellman_berr(s, a, nexts, r, absorbing,
                                     theta, discrete_actions)