        values = evaluation.evaluate_policy(mdp, agent,
                                            initial_states=initial_states)
        stop = values[0] > -67.
        return stop
    else:
        return False


### PBO ##################################
pbo = GradPBO(bellman_model=rho_regressor,
              q_model=q_regressor,
              steps_ahead=STEPS_AHEAD,
              discrete_actions=discrete_actions,
              gamma=mdp.gamma,
              optimizer="adam",
              state_dim=state_dim,
              action_dim=action_dim,
              incremental=INCREMENTAL,
              update_theta_every=UPDATE_EVERY,
              steps_per_theta_update=None,
              verbose=1,
              norm_value=NORM_VALUE,
              independent=INDEPENDENT)
# term_condition=lambda v1, v2:
#     increment_base_termination(v1, v2, 2, tol=1e-2))
# term_condition=lambda v1, v2: terminal_evaluation(v1, v2, 1e-1))


def tmetric(theta):
    # Apply the learned Bellman-operator model STEPS_AHEAD times and
    # report the resulting gain of the Q-regressor as a training metric.
    t = pbo.apply_bo(theta[0], n_times=STEPS_AHEAD)
    return q_regressor.get_k(t)
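# --- Illustrative sketch (added; not part of the original script) --------
# What `pbo.apply_bo(theta, n_times=k)` computes conceptually: the learned
# parameter-to-parameter map f_rho composed k times. `f_rho` below is an
# assumed callable standing in for the bellman_model; this is a sketch of
# the idea, not GradPBO's actual implementation.
def apply_bo_sketch(f_rho, theta, n_times=1):
    """Compose the (assumed) learned Bellman-operator map n_times."""
    for _ in range(n_times):
        theta = f_rho(theta)
    return theta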
q_model = LQRRegressor()  # q-function

s = np.array([1., 2., 3.]).reshape(-1, 1)
a = np.array([0., 3., 4.]).reshape(-1, 1)
nexts = s + 1
r = np.array([-1., -5., 0.])
absorbing = np.array([0., 0., 0.])
discrete_actions = np.array([1, 2, 3]).reshape(-1, 1)  # to be used for maximum estimate

# =================================================================
INCREMENTAL = False
NORM_VAL = 2
ST = 1
gpbo = GradPBO(bellman_model=lbpo, q_model=q_model,
               steps_ahead=ST,
               discrete_actions=discrete_actions,
               gamma=gamma, optimizer="adam",
               norm_value=NORM_VAL,
               state_dim=1, action_dim=1,
               incremental=INCREMENTAL)
gpbo._make_additional_functions()

# The compiled functions must match the plain reference implementations.
assert np.allclose(bellmanop(rho, theta),
                   gpbo.F_bellman_operator(theta)), \
    '{}, {}'.format(bellmanop(rho, theta), gpbo.F_bellman_operator(theta))
assert np.allclose(lqr_reg(s, a, theta), gpbo.F_q(s, a, theta))

berr = gpbo.F_bellman_err(s, a, nexts, r, absorbing, theta, discrete_actions)
tv = multi_step_ebop(s, a, r, nexts, absorbing,
                     discrete_actions, gamma, rho, theta,
                     norm_value=NORM_VAL, incremental=INCREMENTAL,
                     steps=ST)[0]
assert np.allclose(berr, tv), '{}, {}'.format(berr, tv)
print(tv)

berr_grad = gpbo.F_grad_bellman_berr(s, a, nexts, r, absorbing, theta,
                                     discrete_actions)
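# --- Illustrative sketch (added; not part of the original test) ----------
# A plain-NumPy reference for the quantity checked above: a one-step
# empirical Bellman residual
#     || Q(s, a; f_rho(theta)) - (r + gamma * max_a' Q(s', a'; theta)) ||_p
# `q_fun` and `rho_fun` are assumed callables, and the exact loss used by
# multi_step_ebop / F_bellman_err may differ (e.g. squared sums or
# multi-step composition when steps > 1); this is only a sketch.
def bellman_err_sketch(q_fun, rho_fun, s, a, nexts, r, absorbing,
                       actions, gamma, theta, norm_value=2):
    theta_next = rho_fun(theta)  # one application of the operator model
    # max over the discrete actions of Q at the next states
    qmax = np.max(np.stack([q_fun(nexts, np.full_like(nexts, u), theta)
                            for u in actions.ravel()]), axis=0).ravel()
    target = r + gamma * qmax * (1. - absorbing)  # no bootstrap if absorbing
    resid = q_fun(s, a, theta_next).ravel() - target
    return np.linalg.norm(resid, ord=norm_value)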