from time import time

import numpy as np
import theano
from scipy import optimize

# Assumes s, a, nexts, r (sampled transitions), gamma, theta, check_v, lqr_reg,
# empirical_bop, LQRRegressor and EmpiricalBellmanResidualMinimization are
# defined or imported earlier in the script.

discrete_actions = np.array([1., 2., 3.9], dtype=theano.config.floatX).reshape(-1, 1)  # discretization of the actions
# to be used for maximum estimate
# print(s, a, nexts, r, discrete_actions)

q_model = LQRRegressor(theta)  # q-function

pfpo = EmpiricalBellmanResidualMinimization(q_model=q_model,
                                            discrete_actions=discrete_actions,
                                            gamma=gamma,
                                            optimizer="adam",
                                            state_dim=1,
                                            action_dim=1)
start = time()
pfpo._make_additional_functions()  # compile the Theano functions
print('compilation time: {}'.format(time() - start))

# check that the compiled Q-function matches the numpy LQR regressor
check_v(pfpo.F_q(s, a), lqr_reg(s, a, [q_model.theta.eval()]))

print('\n--- checking bellman error')
berr = pfpo.F_bellman_err(s, a, nexts, r, discrete_actions)
tv = empirical_bop(s, a, r, nexts, discrete_actions, gamma, lqr_reg,
                   [q_model.theta.eval()])
check_v(berr, tv, 1)

print('\n--- checking gradient of the bellman error')
# finite-difference check of the compiled gradient against the numpy Bellman error
berr_grad = pfpo.F_grad_bellman_berr(s, a, nexts, r, discrete_actions)
eps = np.sqrt(np.finfo(float).eps)
f = lambda x: empirical_bop(s, a, r, nexts, discrete_actions, gamma, lqr_reg, [x])
approx_grad = optimize.approx_fprime(q_model.theta.eval().ravel(),
                                     f, eps).reshape(berr_grad[0].shape)
check_v(berr_grad, approx_grad, 1)

print()
print('--' * 30)
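
# The gradient check above differentiates `empirical_bop` by finite differences.
# For reference, a minimal sketch (not from the original script) of the quantity
# such an empirical Bellman residual is assumed to compute: the mean squared
# distance between Q(s, a) and the sampled Bellman target
# r + gamma * max_{a'} Q(s', a'), with the max taken over the discretized
# actions. All names below are illustrative.
def empirical_bellman_residual_sketch(s, a, r, nexts, actions, gamma, q, params):
    q_sa = q(s, a, params).ravel()                    # Q(s_t, a_t), shape (N,)
    # Q(s_{t+1}, a') for each discretized action a', stacked to shape (N, M)
    q_next = np.column_stack([q(nexts, np.full_like(a, ai), params)
                              for ai in actions.ravel()])
    target = r.ravel() + gamma * q_next.max(axis=1)   # empirical Bellman target
    return np.mean((q_sa - target) ** 2)              # mean squared residual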