Example #1
        # Lazily compile a Theano function that evaluates the Q-model on a
        # batch of states and actions, and cache it for subsequent calls.
        if not hasattr(self, "eval_f"):
            T_s = T.fmatrix()
            T_a = T.fmatrix()
            self.eval_f = theano.function([T_s, T_a], self.model(T_s, T_a))
        return self.eval_f(s, a)
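
# A self-contained sketch of the same compile-once-and-cache pattern shown
# above; LazyQEvaluator and the `build_q` builder are illustrative names, not
# part of the library (assumes `theano` and `theano.tensor as T` are imported,
# as in the example).
class LazyQEvaluator(object):
    def __init__(self, build_q):
        self.build_q = build_q      # callable: (T_s, T_a) -> symbolic Q expression
        self._eval_f = None

    def __call__(self, s, a):
        if self._eval_f is None:    # compile the Theano function only once
            T_s = T.fmatrix('s')
            T_a = T.fmatrix('a')
            self._eval_f = theano.function([T_s, T_a], self.build_q(T_s, T_a))
        return self._eval_f(s, a)   # later calls reuse the cached compiled function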


# Initial parameter vector for the LQG Q-function regressor
theta0 = np.array([6., 10.001], dtype=theano.config.floatX).reshape(1, -1)
# q_regressor = LQG_NN(2,1,layers=[4,4], activations=['tanh', 'sigmoid'])
q_regressor = LQG_Q(theta0)
##########################################

### PBO ##################################
pfpo = EmpiricalBellmanResidualMinimization(q_model=q_regressor,
                                            discrete_actions=discrete_actions,
                                            gamma=mdp.gamma,
                                            optimizer="Nadam",
                                            state_dim=state_dim,
                                            action_dim=action_dim)
# Split the collected samples into state, action, reward and next-state blocks
state, actions, reward, next_states = split_dataset(dataset,
                                                    state_dim=state_dim,
                                                    action_dim=action_dim,
                                                    reward_dim=reward_dim)
history = pfpo.fit(state.astype(theano.config.floatX), actions.astype(theano.config.floatX),
                   next_states.astype(theano.config.floatX), reward.astype(theano.config.floatX),
                   batch_size=1, nb_epoch=3,
                   theta_metrics={'k': lambda theta: q_regressor.get_k(theta)})
##########################################
# Evaluate the final solution
initial_states = np.array([[1, 2, 5, 7, 10]]).T
values = evaluation.evaluate_policy(mdp, pfpo, initial_states=initial_states)
print(values)
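
# For reference, a minimal sketch of what a dataset splitter like the one used
# above presumably does, assuming each row of `dataset` is laid out as
# [state | action | reward | next_state]; the helper name and the layout are
# assumptions, not taken from the library:
def split_dataset_sketch(dataset, state_dim, action_dim, reward_dim):
    s_end = state_dim
    a_end = s_end + action_dim
    r_end = a_end + reward_dim
    states = dataset[:, :s_end]                         # first block: states
    actions = dataset[:, s_end:a_end]                   # then actions
    rewards = dataset[:, a_end:r_end]                   # then rewards
    next_states = dataset[:, r_end:r_end + state_dim]   # finally next states
    return states, actions, rewards, next_states
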
Example #2
    gamma = 0.99
    theta = np.array([2., 0.2], dtype='float32').reshape(1, -1)

    s = np.array([1., 2., 3.], dtype=theano.config.floatX).reshape(-1, 1)
    a = np.array([0., 3., 4.], dtype=theano.config.floatX).reshape(-1, 1)
    nexts = (s + 1).copy()
    r = np.array([-1., -5., 0.], dtype=theano.config.floatX)
    # Discretization of the actions, to be used for the maximum estimate
    discrete_actions = np.array([1., 2., 3.9], dtype=theano.config.floatX).reshape(-1, 1)
    # print(s,a,nexts,r,discrete_actions)

    q_model = LQRRegressor(theta)  # q-function

    pfpo = EmpiricalBellmanResidualMinimization(q_model=q_model,
                                                discrete_actions=discrete_actions,
                                                gamma=gamma, optimizer="adam",
                                                state_dim=1, action_dim=1)
    start = time()
    pfpo._make_additional_functions()
    print('compilation time: {}'.format(time() - start))

    # The compiled Theano Q-function should match the reference implementation
    check_v(pfpo.F_q(s, a), lqr_reg(s, a, [q_model.theta.eval()]))

    print('\n--- checking bellman error')
    berr = pfpo.F_bellman_err(s, a, nexts, r, discrete_actions)
    tv = empirical_bop(s, a, r, nexts, discrete_actions, gamma, lqr_reg, [q_model.theta.eval()])
    check_v(berr, tv, 1)
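
    # A plain-NumPy sketch of the quantity checked above, reading it as the mean
    # squared empirical Bellman residual
    #     mean_i ( Q(s_i, a_i) - (r_i + gamma * max_{a'} Q(s'_i, a')) )^2
    # This is an assumption about what empirical_bop computes, not its actual
    # implementation; `q` is assumed to map batches of states/actions to Q-values,
    # e.g. bellman_residual_sketch(lqr_reg, [q_model.theta.eval()], s, a, r,
    #                              nexts, discrete_actions, gamma).
    def bellman_residual_sketch(q, theta, s, a, r, nexts, actions, gamma):
        q_sa = np.asarray(q(s, a, theta)).ravel()
        # Q(s', a') for every discrete action, then the max over actions per sample
        q_next = np.column_stack([np.asarray(q(nexts, np.full_like(nexts, ai), theta)).ravel()
                                  for ai in actions.ravel()])
        target = r.ravel() + gamma * q_next.max(axis=1)
        return np.mean((q_sa - target) ** 2)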

    print('\n--- checking gradient of the bellman error')
    berr_grad = pfpo.F_grad_bellman_berr(s, a, nexts, r, discrete_actions)
    eps = np.sqrt(np.finfo(float).eps)  # step size for the finite-difference gradient check