def evaluate(mdp, fqi, initial_states, iteration_values, i, args):
    # Evaluate the current FQI policy. `iteration_values` is the running
    # history owned by the caller and `i` is the current iteration index;
    # both must be passed in so the live plot can grow across calls.
    values = evaluation.evaluate_policy(mdp, fqi, initial_states=initial_states)
    print('J: %f' % values[0])
    iteration_values.append(values[0])
    results = [values]
    if args.plot:
        if i == 1:
            fig1 = plt.figure(1)
            ax = fig1.add_subplot(1, 1, 1)
            ax.plot(range(i + 1), iteration_values, 'ro-')
            plt.ylim(min(iteration_values), max(iteration_values))
            plt.xlim(0, i + 1)
            plt.ion()  # turn on interactive mode
            plt.show()
        elif i > 1:
            # Reuse the line created at i == 1 instead of re-plotting.
            ax = plt.figure(1).gca()
            ax.lines[0].set_data(range(i + 1), iteration_values)
            ax.figure.canvas.draw()
            plt.ylim(min(iteration_values), max(iteration_values))
            plt.xlim(0, i + 1)
            plt.show()
    return results
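# Usage sketch (an assumption, not part of the original script): the caller
# owns the running history and iteration counter that `evaluate` expects.
# `mdp`, `fqi`, `sast`, `r`, `iterations` and `args` follow the FQI setup
# used in the partial_fit loop further below.
iteration_values = []
results = []
fqi.partial_fit(sast, r)
for i in range(iterations):
    if i > 0:
        fqi.partial_fit(None, None)
    results += evaluate(mdp, fqi, mdp.initial_states, iteration_values, i, args)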
def terminal_evaluation(old_theta, new_theta, tol_theta=1e-2):
    # Once the theta increment falls below tolerance, check whether the
    # induced greedy policy already clears the -67 performance threshold.
    if increment_base_termination(old_theta, new_theta, 2, tol_theta):
        estimator = LQG_Q()
        estimator.omega = new_theta[0]
        agent = Algorithm(estimator, state_dim, action_dim, discrete_actions,
                          mdp.gamma, mdp.horizon)
        agent._iteration = 1
        initial_states = np.array([[1, 2, 5, 7, 10]]).T
        values = evaluation.evaluate_policy(mdp, agent,
                                            initial_states=initial_states)
        return values[0] > -67.
    else:
        return False
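# Sketch of how this check can terminate a fitting loop early (an assumption,
# not the library's API: `update_theta` is a hypothetical single update step).
old_theta = theta0
for epoch in range(EPOCH):
    new_theta = update_theta(old_theta)  # hypothetical one-step update
    if terminal_evaluation(old_theta, new_theta):
        break
    old_theta = new_theta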
def evaluate(mdp, fqi, initial_states, iteration_values, i, args):
    # Variant of the helper above: same plotting logic, but it returns the
    # raw `values` tuple instead of a result list.
    values = evaluation.evaluate_policy(mdp, fqi, initial_states=initial_states)
    iteration_values.append(values[0])
    if args.plot:
        if i == 1:
            fig1 = plt.figure(1)
            ax = fig1.add_subplot(1, 1, 1)
            ax.plot(range(i + 1), iteration_values, 'ro-')
            plt.ylim(min(iteration_values), max(iteration_values))
            plt.xlim(0, i + 1)
            plt.ion()  # turn on interactive mode
            plt.show()
        elif i > 1:
            ax = plt.figure(1).gca()
            ax.lines[0].set_data(range(i + 1), iteration_values)
            ax.figure.canvas.draw()
            plt.ylim(min(iteration_values), max(iteration_values))
            plt.xlim(0, i + 1)
            plt.show()
    return values
fit_params = {}
# fit_params = {
#     "n_epochs": 300,
#     "batch_size": 50,
#     "validation_split": 0.1,
#     "verbosity": False,
#     "criterion": "mse"
# }
fqi.partial_fit(sast, r, **fit_params)

iterations = 20
iteration_values = []
for i in range(iterations - 1):
    fqi.partial_fit(None, None, **fit_params)
    values = evaluation.evaluate_policy(mdp, fqi,
                                        initial_states=mdp.initial_states)
    print(values)
    iteration_values.append(values[0])
    if i == 1:
        fig1 = plt.figure(1)
        ax = fig1.add_subplot(1, 1, 1)
        h = ax.plot(range(i + 1), iteration_values, 'ro-')
        plt.ylim(min(iteration_values), max(iteration_values))
        plt.xlim(0, i + 1)
        plt.ion()  # turn on interactive mode
        plt.show()
    elif i > 1:
        h[0].set_data(range(i + 1), iteration_values)
        ax.figure.canvas.draw()
        plt.ylim(min(iteration_values), max(iteration_values))
        plt.xlim(0, i + 1)
        plt.show()
state, actions, reward, next_states, absorbing = split_dataset(
    dataset, state_dim=state_dim, action_dim=action_dim,
    reward_dim=reward_dim)

theta0 = np.array([6., 10.001], dtype='float32').reshape(1, -1)
# theta0 = np.array([16., 10.001], dtype='float32').reshape(1, -1)
history = pbo.fit(state, actions, next_states, reward, absorbing, theta0,
                  batch_size=10, nb_epoch=EPOCH,
                  theta_metrics={'k': tmetric})

##########################################
# Evaluate the final solution
initial_states = np.array([[1, 2, 5, 7, 10]]).T
values = evaluation.evaluate_policy(mdp, pbo, initial_states=initial_states)
print('Learned theta: {}'.format(pbo.learned_theta_value))
print('Final performance of PBO: {}'.format(values))

##########################################
# Some plots
ks = np.array(history['k']).squeeze()
weights = np.array(history['theta']).squeeze()
print(weights.shape)

plt.figure()
plt.title('[train] evaluated weights')
plt.scatter(weights[:, 0], weights[:, 1], s=50,
            c=np.arange(weights.shape[0]),
            cmap='viridis', linewidth=0)
plt.xlabel('b')
plt.ylabel('k')
# (continuation of the PBO constructor call; the opening of the call is not
# shown in this excerpt)
    estimator_rho=rho_regressor,
    state_dim=state_dim,
    action_dim=action_dim,
    discrete_actions=discrete_actions,
    gamma=mdp.gamma,
    learning_steps=50,
    batch_size=10,
    learning_rate=1e-1,
    incremental=INCREMENTAL,
    verbose=True)

weights = pbo.fit(sast, r)

##########################################
initial_states = np.array([[1, 2, 5, 7, 10]]).T
values = evaluation.evaluate_policy(mdp, pbo, initial_states=initial_states)
print(values)

from matplotlib import pyplot as plt

weights = np.array(weights)
plt.subplot(1, 3, 1)
plt.title('[train] evaluated weights')
plt.xlabel('b')
plt.ylabel('k')
plt.scatter(weights[:, 1], weights[:, 0], s=50,
            c=np.arange(weights.shape[0]), cmap='inferno')
plt.colorbar()

best_rhos = pbo._rho_values[-1]
ks = q_regressor._regressor.get_k(np.array(pbo._q_weights_list))
plt.subplot(1, 3, 2)
            # Tail of estimate_qvalue: roll the policy forward, accumulating
            # the discounted reward, then return the mean and a 95% half-width.
            s_t = mdp.get_state()
            u_t = policy.draw_action(s_t, False, True)
            returns[j] += df * mdp.step(u_t)[1]
            df *= mdp.gamma
    return returns.mean(), 2 * returns.std() / np.sqrt(n_rep)

##############################################################
# Compute the discounted reward
n_rep = 1000
J = mdp.computeJ(K, S, n_random_x0=n_rep)
print("K", K)
pol = tmp_policy(K, S)
Jsample = []
for i in range(n_rep):
    Jsample.append(evaluation.evaluate_policy(
        mdp, pol, initial_states=initial_state)[0])
print("J", J, np.mean(Jsample), np.std(Jsample) / np.sqrt(n_rep) * 1.96)

##############################################################
# Compute the q-function
x = np.array([2])
u = np.array([0])
q_val, q_std = estimate_qvalue(mdp, x, u, policy=pol, ep_length=400,
                               n_rep=n_rep)
v = mdp.computeQFunction(x, u, K, S, n_rep)
print("Q", q_val, q_std, v)

##############################################################
# Plot the q-function
xs = np.linspace(-mdp.max_pos, mdp.max_pos, 60)
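# Sketch of the plot the excerpt cuts off (an assumption: sample the
# Q-function on the grid `xs` at the fixed action u and compare it with the
# closed-form value; n_rep is lowered to keep the sweep cheap, and `plt` is
# imported as in the neighbouring snippets).
qs = [estimate_qvalue(mdp, np.array([p]), u, policy=pol,
                      ep_length=400, n_rep=100)[0] for p in xs]
vs = [mdp.computeQFunction(np.array([p]), u, K, S, 100) for p in xs]
plt.figure()
plt.plot(xs, qs, label='Monte Carlo estimate')
plt.plot(xs, vs, label='closed form')
plt.xlabel('x')
plt.ylabel('Q(x, u=0)')
plt.legend()
plt.show()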
import ifqi.envs as env  # assumed import: the excerpt below uses `env`
from ifqi.algorithms.lspi import LSPI
from ifqi.envs.utils import get_space_info
from ifqi.evaluation import evaluation
from ifqi.evaluation.utils import check_dataset, split_data_for_fqi
from ifqi.models.linear import Linear
from ifqi.models.regressor import Regressor

mdp = env.CarOnHill()
state_dim, action_dim, reward_dim = get_space_info(mdp)
nextstate_idx = state_dim + action_dim + reward_dim
reward_idx = action_dim + state_dim

# dataset: s, a, r, s'
dataset = evaluation.collect_episodes(mdp, n_episodes=500)
check_dataset(dataset, state_dim, action_dim, reward_dim)

regressor_params = dict(features=dict(name='poly', params=dict(degree=5)))
regressor = Regressor(Linear, **regressor_params)

lspi = LSPI(regressor, state_dim, action_dim,
            mdp.action_space.values, mdp.gamma)

sast, r = split_data_for_fqi(dataset, state_dim, action_dim, reward_dim)
lspi.fit(sast, r)

values = evaluation.evaluate_policy(mdp, lspi,
                                    initial_states=mdp.initial_states)
print(values)
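# Follow-up sketch (an assumption, not in the original script): re-run the
# same LSPI pipeline at a few dataset sizes to see how sensitive the learned
# policy's performance is to the amount of collected experience.
for n_ep in (100, 250, 500):
    ds = evaluation.collect_episodes(mdp, n_episodes=n_ep)
    sast_n, r_n = split_data_for_fqi(ds, state_dim, action_dim, reward_dim)
    lspi_n = LSPI(Regressor(Linear, **regressor_params), state_dim,
                  action_dim, mdp.action_space.values, mdp.gamma)
    lspi_n.fit(sast_n, r_n)
    print(n_ep, evaluation.evaluate_policy(mdp, lspi_n,
                                           initial_states=mdp.initial_states))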