# Tolerance value forwarded to lspi.lspi below (presumably its stopping
# threshold -- confirm against the lspi module).
epsilon = 10 ** -12

# Collect one batch of samples; if it holds fewer than 5000 entries, append
# exactly one more batch. This is a single top-up, not a loop.
samples = collect_samples(sim)
if len(samples) < 5000:
    samples += collect_samples(sim)

discount = 0.8

# construct a graph from the samples
graph = pvf.construct_graph(samples, Simulator.states)
basis = pvf.create_basis_function(
    graph, Simulator.states, Simulator.actions, k
)

# Start from a policy built on that basis, then run LSPI over the samples.
policy = initialize_policy(0.0, discount, basis)
final_policy, all_policies = lspi.lspi(maxiter, epsilon, samples, policy)
value_policy = initialize_value_function_policy(sim)

# Two panels of a 2x2 grid: the default display_qvalues view, then the same
# call with dim=1. Both panels carry the same title.
plt.figure()
plt.subplot(2, 2, 1)
approxV = display_qvalues(sim, final_policy)
plt.title('Estimated Value Function')

plt.subplot(2, 2, 2)
display_qvalues(sim, final_policy, dim=1)
plt.title('Estimated Value Function')

# Disabled alternative display, kept as found. The first character of each
# call looks clipped (presumably plt.subplot / display_policy / plt.show).
#lt.subplot(1,2,2)
#isplay_policy(sim, final_policy)
#lt.show()
if (k, episode) not in data: data[(k, episode)] = [] samples = rooms.collect_samples(sim, maxepisodes=episode, maxsteps=max_steps) graph = pvf.construct_graph(samples, sim.states) try: basis = pvf.create_basis_function(graph, sim.states, sim.actions, k) except: print "Couldn't compute basis function for this data" continue policy = rooms.initialize_policy(0.0, discount, basis) final_policy = lspi.lspi(maxiter, epsilon, samples, policy)[0] for n in range(num_tries): execution_data = rooms.test_execution(sim, final_policy, maxsteps=max_steps) data[(k, episode)].append(execution_data) for episode in range(start_episode, end_episode+1, step_episode): for k in range(start_k, end_k+1, step_k): total_steps = 0 data_list = data.get((k, episode), []) for data_point in data_list: total_steps += data_point[2] if k not in final_data: