import numpy as np
import theano
import theano.tensor as T
# Project-local names (LQG_Q, LQG_NN, EmpiricalBellmanResidualMinimization,
# split_dataset, evaluation, mdp, dataset, state_dim, action_dim, reward_dim,
# discrete_actions) are assumed to be defined elsewhere in the project.


def evaluate(self, s, a):  # fragment: method of the Q-model class; name assumed
    # Compile the Theano evaluation function once, then cache and reuse it.
    if not hasattr(self, "eval_f"):
        T_s = T.fmatrix()
        T_a = T.fmatrix()
        self.eval_f = theano.function([T_s, T_a], self.model(T_s, T_a))
    return self.eval_f(s, a)


theta0 = np.array([6., 10.001], dtype=theano.config.floatX).reshape(1, -1)
# q_regressor = LQG_NN(2, 1, layers=[4, 4], activations=['tanh', 'sigmoid'])
q_regressor = LQG_Q(theta0)

##########################################
### PBO ##################################
pfpo = EmpiricalBellmanResidualMinimization(q_model=q_regressor,
                                            discrete_actions=discrete_actions,
                                            gamma=mdp.gamma,
                                            optimizer="Nadam",
                                            state_dim=state_dim,
                                            action_dim=action_dim)
state, actions, reward, next_states = split_dataset(dataset,
                                                    state_dim=state_dim,
                                                    action_dim=action_dim,
                                                    reward_dim=reward_dim)
history = pfpo.fit(state.astype(theano.config.floatX),
                   actions.astype(theano.config.floatX),
                   next_states.astype(theano.config.floatX),
                   reward.astype(theano.config.floatX),
                   batch_size=1, nb_epoch=3,
                   theta_metrics={'k': lambda theta: q_regressor.get_k(theta)})

##########################################
# Evaluate the final solution
initial_states = np.array([[1, 2, 5, 7, 10]]).T
values = evaluation.evaluate_policy(mdp, pfpo, initial_states=initial_states)
print(values)
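# For context, a minimal sketch of what split_dataset above is assumed to do:
# slice a flat SARS' dataset column-wise into
# [state | action | reward | next_state] blocks. The name
# split_dataset_sketch and the column layout are illustrative assumptions;
# the project's real helper may differ.
def split_dataset_sketch(data, state_dim, action_dim, reward_dim):
    s_end = state_dim
    a_end = s_end + action_dim
    r_end = a_end + reward_dim
    states = data[:, :s_end]                         # s
    actions = data[:, s_end:a_end]                   # a
    rewards = data[:, a_end:r_end]                   # r
    next_states = data[:, r_end:r_end + state_dim]   # s'
    return states, actions, rewards, next_states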
# Sanity-check script for EmpiricalBellmanResidualMinimization
# (LQRRegressor, check_v, lqr_reg and empirical_bop are assumed to be
# defined elsewhere in the project).
from time import time

gamma = 0.99
theta = np.array([2., 0.2], dtype=theano.config.floatX).reshape(1, -1)
s = np.array([1., 2., 3.], dtype=theano.config.floatX).reshape(-1, 1)
a = np.array([0., 3., 4.], dtype=theano.config.floatX).reshape(-1, 1)
nexts = (s + 1).copy()
r = np.array([-1., -5., 0.], dtype=theano.config.floatX)
# Discretization of the actions, used for the maximum estimate.
discrete_actions = np.array([1., 2., 3.9],
                            dtype=theano.config.floatX).reshape(-1, 1)

q_model = LQRRegressor(theta)  # q-function

pfpo = EmpiricalBellmanResidualMinimization(q_model=q_model,
                                            discrete_actions=discrete_actions,
                                            gamma=gamma,
                                            optimizer="adam",
                                            state_dim=1, action_dim=1)
start = time()
pfpo._make_additional_functions()
print('compilation time: {}'.format(time() - start))

check_v(pfpo.F_q(s, a), lqr_reg(s, a, [q_model.theta.eval()]))

print('\n--- checking bellman error')
berr = pfpo.F_bellman_err(s, a, nexts, r, discrete_actions)
tv = empirical_bop(s, a, r, nexts, discrete_actions, gamma,
                   lqr_reg, [q_model.theta.eval()])
check_v(berr, tv, 1)

print('\n--- checking gradient of the bellman error')
berr_grad = pfpo.F_grad_bellman_berr(s, a, nexts, r, discrete_actions)
eps = np.sqrt(np.finfo(float).eps)
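# The eps above sets up a derivative check that the excerpt leaves implicit.
# A minimal sketch, assuming empirical_bop returns the scalar empirical
# Bellman error for a given theta: central finite differences, perturbing
# one theta entry at a time, compared against the analytic berr_grad.
# Purely illustrative; the helper name berr_at is not repo code.
def berr_at(theta_flat):
    th = theta_flat.reshape(q_model.theta.eval().shape)
    return empirical_bop(s, a, r, nexts, discrete_actions, gamma,
                         lqr_reg, [th])

theta_flat = q_model.theta.eval().ravel()
num_grad = np.zeros_like(theta_flat)
for i in range(theta_flat.size):
    d = np.zeros_like(theta_flat)
    d[i] = eps
    num_grad[i] = (berr_at(theta_flat + d) - berr_at(theta_flat - d)) / (2 * eps)
print('analytic gradient: {}'.format(berr_grad))
print('numerical gradient: {}'.format(num_grad))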