def main():
    m = 1
    l = 1
    b = 0.15
    g = 9.81
    dt = 0.005
    goal = np.array([[np.pi], [0]])
    x_limits = np.array([0, 2 * np.pi])
    numPointsx = 51
    dx = (x_limits[-1] - x_limits[0]) / (numPointsx - 1)
    x_dot_limits = np.array([-6.5, 6.5])
    numPointsx_dot = 81
    dx_dot = (x_dot_limits[-1] - x_dot_limits[0]) / (numPointsx_dot - 1)
    Q = np.array([[40, 0], [0, 0.02]])
    R = 0.02
    environment = pendulum(m, l, b, g, dt, goal, x_limits, dx,
                           x_dot_limits, dx_dot, Q, R)

    gamma = 0.99
    x_grid = np.linspace(x_limits[0], x_limits[1], numPointsx)
    x_dot_grid = np.linspace(x_dot_limits[0], x_dot_limits[1], numPointsx_dot)
    u_limits = np.array([-15, 15])
    numPointsu = 31
    u_grid = np.linspace(u_limits[0], u_limits[1], numPointsu)
    num_iterations = 400
    policy, V = ValueIterationSwingUp(environment, gamma, x_grid, x_dot_grid,
                                      u_grid, num_iterations)

    plt.imshow(np.reshape(V, (numPointsx, numPointsx_dot)).T, aspect='auto',
               extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]),
               origin='upper')
    plt.ylabel('theta-dot')
    plt.xlabel('theta')
    plt.title('Value Function')
    plt.colorbar()
    plt.show()
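# ValueIterationSwingUp is called throughout these scripts with a discretized pendulum
# environment, a discount factor, the two state grids, an action grid, and an iteration
# count, and returns a flattened policy and value function over the (theta, theta-dot)
# grid. As a rough illustration only (not the repository's implementation), a tabular
# backup with that interface could look like the sketch below; lookup_next_state_and_reward
# is a hypothetical helper standing in for the environment's discretized transition and
# reward (or negated cost) model.
import numpy as np


def value_iteration_sketch(lookup_next_state_and_reward, gamma, x_grid, x_dot_grid,
                           u_grid, num_iterations):
    """Tabular value iteration over a 2-D state grid and a 1-D action grid (sketch)."""
    num_states = x_grid.size * x_dot_grid.size
    V = np.zeros(num_states)
    policy = np.zeros(num_states)
    for _ in range(num_iterations):
        Q = np.empty((u_grid.size, num_states))
        for i, u in enumerate(u_grid):
            # Hypothetical helper: next-state indices and one-step rewards for every
            # grid state under action u.
            next_idx, reward = lookup_next_state_and_reward(u)
            Q[i, :] = reward + gamma * V[next_idx]      # Bellman backup
        policy = u_grid[np.argmax(Q, axis=0)]           # greedy action per state
        V = np.max(Q, axis=0)                           # value update
    return policy, V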
def main(): """ Initialize environments """ m = 1 mass_factor = 2.1 l = 1 length_factor = 1.1 b = 0.15 g = 9.81 dt = 0.005 goal = np.array([[np.pi], [0]]) x_limits = np.array([0, 6.2832]) numPointsx = 51 dx = (x_limits[-1] - x_limits[0]) / (numPointsx - 1) x_dot_limits = np.array([-6.5, 6.5]) numPointsx_dot = 81 dx_dot = (x_dot_limits[-1] - x_dot_limits[0]) / (numPointsx_dot - 1) Q = np.array([[40, 0], [0, 0.02]]) R = 1.0 test_policies = False environment = pendulum(m, l, b, g, dt, goal, x_limits, dx, x_dot_limits, dx_dot, Q, R) environment_target = pendulum(m * mass_factor, l * length_factor, b, g, dt, goal, x_limits, dx, x_dot_limits, dx_dot, Q, R) """ Learn an initial policy and value function """ gamma = 0.99 x_grid = np.linspace(x_limits[0], x_limits[1], numPointsx) x_dot_grid = np.linspace(x_dot_limits[0], x_dot_limits[1], numPointsx_dot) u_limits = np.array([-15, 15]) numPointsu = 121 u_grid = np.linspace(u_limits[0], u_limits[1], numPointsu) num_iterations = 600 code_dir = os.path.dirname(os.path.realpath(__file__)) data_dir = '../data/GPTD' data_dir = os.path.join(code_dir, data_dir) print('Value Iteration for target domain') target_file = 'data_m_%.2f_l_%.2f.pkl' % (m * mass_factor, l * length_factor) fileFound = False for root, dirs, files in os.walk(data_dir): for file in files: if (file.endswith('.pkl') and file == target_file): fileFound = True print('Relevant pre-computed data found!') data = pkl.load(open(os.path.join(data_dir, target_file), 'rb')) policy_target = data[0] V_target = data[1] if (not fileFound): policy_target, V_target = ValueIterationSwingUp( environment_target, gamma, x_grid, x_dot_grid, u_grid, num_iterations) pkl.dump((policy_target, V_target), open(os.path.join(data_dir, target_file), 'wb')) print('Value Iteration in simulation') start_file = 'data_m_%.2f_l_%.2f.pkl' % (m, l) fileFound = False for root, dirs, files in os.walk(data_dir): for file in files: if (file.endswith('.pkl') and file == start_file): fileFound = True print('Relevant pre-computed data found!') data = pkl.load(open(os.path.join(data_dir, start_file), 'rb')) policy_start = data[0] V_start = data[1] if (not fileFound): policy_start, V_start = ValueIterationSwingUp(environment, gamma, x_grid, x_dot_grid, u_grid, num_iterations) pkl.dump((policy_start, V_start), open(os.path.join(data_dir, start_file), 'wb')) V_target = np.reshape(V_target, (numPointsx, numPointsx_dot)) V_start = np.reshape(V_start, (numPointsx, numPointsx_dot)) policy_target = np.reshape(policy_target, (numPointsx, numPointsx_dot)) policy_start = np.reshape(policy_start, (numPointsx, numPointsx_dot)) """ Test learned policies """ if (test_policies): policy_start_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_start) dyn_start = lambda t, s: environment_target.dynamics_continuous( s, policy_start_) int_start = ode(dyn_start).set_integrator('vode', method='bdf', with_jacobian=False) int_start.set_initial_value(np.array([[0], [0]]), 0) t_final = 10 trajectory_start = np.empty((2, int(t_final / dt))) num_steps = 0 while int_start.successful() and int_start.t < t_final: int_start.integrate(int_start.t + dt) trajectory_start[:, num_steps] = int_start.y[:, 0] num_steps += 1 trajectory_start = trajectory_start[:, 0:num_steps] plt.plot(trajectory_start[0, :], trajectory_start[1, :]) plt.scatter(np.pi, 0, c='red', marker='o') plt.xlabel('theta') plt.ylabel('theta-dot') plt.title('Bootstrapped Policy') plt.show() policy_target_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_target) dyn_target = lambda t, s: 
environment_target.dynamics_continuous( s, policy_target_) int_target = ode(dyn_target).set_integrator('vode', method='bdf', with_jacobian=False) int_target.set_initial_value(np.array([[0], [0]]), 0) trajectory_target = np.empty((2, int(t_final / dt))) num_steps = 0 while int_target.successful() and int_target.t < t_final: int_target.integrate(int_target.t + dt) trajectory_target[:, num_steps] = int_target.y[:, 0] num_steps += 1 trajectory_target = trajectory_target[:, 0:num_steps] plt.plot(trajectory_target[0, :], trajectory_target[1, :]) plt.scatter(np.pi, 0, c='red', marker='o') plt.xlabel('theta') plt.ylabel('theta-dot') plt.title('Target Policy') plt.show() """ GPTD """ sigma0 = 0.2 # sigmaf = 7.6156 sigmaf = 5.0738 # sigmal = np.array([[0.6345],[1.2656]], dtype=np.float64) sigmal = np.array([[1.2703], [1.8037], [5.1917]], dtype=np.float64) kernel = SqExpArd(sigmal, sigmaf) policy_target_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_target) policy_prior = lambda s: policy_target_(s.T)[:, np.newaxis] V_mu = RegularGridInterpolator((x_grid, x_dot_grid), V_start) V_mu_ = lambda s: V_mu(s.T)[:, np.newaxis] nu = (np.exp(-1) - 0.3) max_episode_length = 1000 num_episodes = 600 states = np.mgrid[x_grid[0]:(x_grid[-1] + dx):dx, x_dot_grid[0]:(x_dot_grid[-1] + dx_dot):dx_dot] states = np.concatenate((np.reshape(states[0,:,:], (1,states.shape[1]*states.shape[2])),\ np.reshape(states[1,:,:], (1,states.shape[1]*states.shape[2]))), axis=0) print('GPTD.. ') print('Initial mean error:%f' % np.mean(np.abs(V_target - V_start))) test_every = 50 num_runs = 5 test_error = np.empty((num_runs, int(num_episodes / test_every) + 1)) for i in range(num_runs): np.random.seed(i * 20) gptd = GPTD(environment_target, nu, sigma0, gamma, kernel, V_mu_) test_error_ = gptd.build_posterior(policy_prior, num_episodes, max_episode_length, \ test_every, states_V_target=(states, np.reshape(V_target, (states.shape[1],1)))) V_gptd = gptd.get_value_function(states) V_gptd = np.reshape(V_gptd, (numPointsx, numPointsx_dot)) print('Final mean error:%f' % np.mean(np.abs(V_target - V_gptd))) test_error_ = np.concatenate( (test_error_, np.array([np.mean(np.abs(V_gptd - V_target))]))) test_error[i, :] = test_error_ set_trace() """ Results """ plt.subplot(3, 1, 1) plt.imshow(np.abs(V_target - V_start).T, aspect='auto',\ extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]), origin='upper') plt.ylabel('theta-dot') plt.xlabel('theta') plt.title('Initial Diff') plt.colorbar() plt.subplot(3, 1, 2) plt.imshow(np.abs(V_target - V_gptd).T, aspect='auto',\ extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]), origin='upper') plt.ylabel('theta-dot') plt.xlabel('theta') plt.title('Final Diff') plt.colorbar() plt.subplot(3, 1, 3) plt.scatter(gptd.D[0, :], gptd.D[1, :], marker='o', c='red') plt.xlim(x_limits[0], x_limits[1]) plt.xlabel('theta') plt.ylim(x_dot_limits[0], x_dot_limits[1]) plt.ylabel('theta-dot') plt.title('Dictionary Points') resultDirName = 'GPTD_run' run = -1 for root, dirs, files in os.walk(data_dir): for d in dirs: if (d.startswith(resultDirName)): extension = d.split(resultDirName)[-1] if (extension.isdigit() and int(extension) >= run): run = int(extension) run += 1 saveDirectory = os.path.join(data_dir, resultDirName + str(run)) os.mkdir(saveDirectory) set_trace() with open(os.path.join(saveDirectory, 'session_%d.pkl' % num_episodes), 'wb') as f_: dl.dump((test_error), f_) plt.savefig(os.path.join(saveDirectory, 'V_Diff.png')) plt.show() sns.tsplot(test_error) plt.xlabel('Episodes x%d' % 
test_every) plt.ylabel('Mean absolute error') plt.title('GPTD') plt.savefig(os.path.join(saveDirectory, 'Learning_Trend.png')) plt.show() set_trace()
def main(): """ Initialize environments """ m = 1 mass_factor = 2.1 l = 1 length_factor = 1.1 b = 0.15 g = 9.81 dt = 0.005 goal = np.array([[np.pi], [0]]) x_limits = np.array([0, 6.2832]) numPointsx = 51 dx = (x_limits[-1] - x_limits[0]) / (numPointsx - 1) x_dot_limits = np.array([-6.5, 6.5]) numPointsx_dot = 81 dx_dot = (x_dot_limits[-1] - x_dot_limits[0]) / (numPointsx_dot - 1) Q = np.array([[40, 0], [0, 0.02]]) R = 0.2 test_policies = False environment = pendulum(m, l, b, g, dt, goal, x_limits, dx, x_dot_limits, dx_dot, Q, R) environment_target = pendulum(m * mass_factor, l * length_factor, b, g, dt, goal, x_limits, dx, x_dot_limits, dx_dot, Q, R) """ Learn an initial policy and value function """ gamma = 0.99 x_grid = np.linspace(x_limits[0], x_limits[1], numPointsx) x_dot_grid = np.linspace(x_dot_limits[0], x_dot_limits[1], numPointsx_dot) u_limits = np.array([-15.0, 15.0]) numPointsu = 121 u_grid = np.linspace(u_limits[0], u_limits[1], numPointsu) num_iterations = 600 code_dir = os.path.dirname(os.path.realpath(__file__)) data_dir = '../data/GPSARSA' data_dir = os.path.join(code_dir, data_dir) print('Value Iteration for target domain') target_file = 'data_m_%.2f_l_%.2f.pkl' % (m * mass_factor, l * length_factor) fileFound = False for root, dirs, files in os.walk(data_dir): for file in files: if (file.endswith('.pkl') and file == target_file): fileFound = True print('Relevant pre-computed data found!') data = pkl.load(open(os.path.join(data_dir, target_file), 'rb')) policy_target = data[0] V_target = data[1] if (not fileFound): policy_target, V_target = ValueIterationSwingUp( environment_target, gamma, x_grid, x_dot_grid, u_grid, num_iterations) pkl.dump((policy_target, V_target), open(os.path.join(data_dir, target_file), 'wb')) print('Value Iteration in simulation') start_file = 'data_m_%.2f_l_%.2f.pkl' % (m, l) fileFound = False for root, dirs, files in os.walk(data_dir): for file in files: if (file.endswith('.pkl') and file == start_file): fileFound = True print('Relevant pre-computed data found!') data = pkl.load(open(os.path.join(data_dir, start_file), 'rb')) policy_start = data[0] V_start = data[1] if (not fileFound): policy_start, V_start = ValueIterationSwingUp(environment, gamma, x_grid, x_dot_grid, u_grid, num_iterations) pkl.dump((policy_start, V_start), open(os.path.join(data_dir, start_file), 'wb')) V_target = np.reshape(V_target, (numPointsx, numPointsx_dot)) V_start = np.reshape(V_start, (numPointsx, numPointsx_dot)) policy_target = np.reshape(policy_target, (numPointsx, numPointsx_dot)) policy_start = np.reshape(policy_start, (numPointsx, numPointsx_dot)) """ Test learned policies """ if (test_policies): policy_start_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_start) dyn_start = lambda t, s: environment_target.dynamics_continuous( s, policy_start_) int_start = ode(dyn_start).set_integrator('vode', method='bdf', with_jacobian=False) int_start.set_initial_value(np.array([[0], [0]]), 0) t_final = 10 trajectory_start = np.empty((2, int(t_final / dt))) num_steps = 0 while int_start.successful() and int_start.t < t_final: int_start.integrate(int_start.t + dt) trajectory_start[:, num_steps] = int_start.y[:, 0] num_steps += 1 trajectory_start = trajectory_start[:, 0:num_steps] plt.plot(trajectory_start[0, :], trajectory_start[1, :]) plt.scatter(np.pi, 0, c='red', marker='o') plt.xlabel('theta') plt.ylabel('theta-dot') plt.title('Bootstrapped Policy') plt.show() policy_target_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_target) dyn_target = lambda t, s: 
environment_target.dynamics_continuous( s, policy_target_) int_target = ode(dyn_target).set_integrator('vode', method='bdf', with_jacobian=False) int_target.set_initial_value(np.array([[0], [0]]), 0) trajectory_target = np.empty((2, int(t_final / dt))) num_steps = 0 while int_target.successful() and int_target.t < t_final: int_target.integrate(int_target.t + dt) trajectory_target[:, num_steps] = int_target.y[:, 0] num_steps += 1 trajectory_target = trajectory_target[:, 0:num_steps] plt.plot(trajectory_target[0, :], trajectory_target[1, :]) plt.scatter(np.pi, 0, c='red', marker='o') plt.xlabel('theta') plt.ylabel('theta-dot') plt.title('Target Policy') plt.show() """ GPSARSA """ sigma0 = 0.2 sigmaf = 13.6596 sigmal = np.array([[0.5977], [1.9957], [5.7314]]) epsilon = 0.1 max_episode_length = 1000 num_episodes = 1000 kernel = SqExpArd(sigmal, sigmaf) states = np.mgrid[x_grid[0]:(x_grid[-1] + dx):dx, x_dot_grid[0]:(x_dot_grid[-1] + dx_dot):dx_dot] states = np.concatenate((np.reshape(states[0,:,:], (1,states.shape[1]*states.shape[2])),\ np.reshape(states[1,:,:], (1,states.shape[1]*states.shape[2]))), axis=0) V_mu = lambda s: RegularGridInterpolator( (x_grid, x_dot_grid), V_start)(s.T) Q_mu = buildQfromV( V_mu, environment, gamma, states, u_grid[np.newaxis, :]) # Q_mu is number_of_actions x number_of_states Q_mu = np.reshape(Q_mu.T, (numPointsx, numPointsx_dot, numPointsu)) Q_mu = RegularGridInterpolator((x_grid, x_dot_grid, u_grid), Q_mu) Q_mu_ = lambda s, a: Q_mu( np.concatenate((s, a + 0.0001 * (a[0, :] <= u_grid[0]) - 0.0001 * (a[0, :] >= u_grid[-1])), axis=0).T)[:, np.newaxis] policy_start_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_start) policy_prior = lambda s: policy_start_(s.T)[:, np.newaxis] numElementsInD = 1500 D = (np.concatenate((x_limits[0] + np.random.rand(1,numElementsInD)*(x_limits[-1] - x_limits[0]),\ x_dot_limits[0] + np.random.rand(1,numElementsInD)*(x_dot_limits[-1] - x_dot_limits[0])), axis=0), \ u_limits[0] + np.random.rand(1,numElementsInD)*(u_limits[-1] - u_limits[0])) # D = (D, policy_prior(D).T) gpsarsa = GPSARSA_fixedGrid(environment_target, u_limits[np.newaxis, :], sigma0, gamma, epsilon, kernel, D, Q_mu_, policy_prior) print('GPSARSA.. 
') gpsarsa.build_policy_monte_carlo(num_episodes, max_episode_length) V_gpsarsa = gpsarsa.get_value_function(states) V_gpsarsa = np.reshape(V_gpsarsa, (numPointsx, numPointsx_dot)) print('Initial mean error:%f' % np.mean(np.abs(V_target - V_start))) print('Final mean error:%f' % np.mean(np.abs(V_target - V_gpsarsa))) set_trace() """ Results """ plt.subplot(3, 1, 1) plt.imshow(np.abs(V_target - V_start).T, aspect='auto',\ extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]), origin='upper') plt.ylabel('theta-dot') plt.xlabel('theta') plt.title('Initial Diff') plt.colorbar() plt.subplot(3, 1, 2) plt.imshow(np.abs(V_target - V_gpsarsa).T, aspect='auto',\ extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]), origin='upper') plt.ylabel('theta-dot') plt.xlabel('theta') plt.title('Final Diff') plt.colorbar() plt.subplot(3, 1, 3) plt.scatter(gpsarsa.D[0, :], gpsarsa.D[1, :], marker='o', c='red') plt.xlim(x_limits[0], x_limits[1]) plt.xlabel('theta') plt.ylim(x_dot_limits[0], x_dot_limits[1]) plt.ylabel('theta-dot') plt.title('Dictionary Points') resultDirName = 'GPSARSA_fixedGrid_run' run = -1 for root, dirs, files in os.walk(data_dir): for d in dirs: if (d.startswith(resultDirName)): extension = d.split(resultDirName)[-1] if (extension.isdigit() and int(extension) >= run): run = int(extension) run += 1 saveDirectory = os.path.join(data_dir, resultDirName + str(run)) os.mkdir(saveDirectory) dl.dump_session(filename=os.path.join(saveDirectory, 'session_%d' % num_episodes)) plt.savefig(os.path.join(saveDirectory, 'V_Diff.png')) plt.show() set_trace()
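# buildQfromV is used above (and in the scripts that follow) to turn the simulation
# value function into an action-value prior over the state grid; the original inline
# comment notes its output is number_of_actions x number_of_states. Purely for
# illustration (this is not the repository's implementation), such a one-step backup,
# Q(s, a) = r(s, a) + gamma * V(s'), could be sketched as below; env_step is a
# hypothetical batched transition/reward helper, not the pendulum class's actual API.
import numpy as np


def build_q_from_v_sketch(V, env_step, gamma, states, actions):
    """states: 2 x N state grid, actions: 1 x M action grid; returns an M x N table."""
    Q = np.empty((actions.shape[1], states.shape[1]))
    for i in range(actions.shape[1]):
        # Broadcast the i-th action across all grid states.
        a = np.repeat(actions[:, i:i + 1], states.shape[1], axis=1)
        next_states, rewards = env_step(states, a)   # hypothetical one-step model
        Q[i, :] = rewards + gamma * V(next_states)   # Bellman backup from V
    return Q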
def main():
    m = 1
    mass_factor = 1.7
    l = 1
    length_factor = 0.8
    b = 0.15
    g = 9.81
    dt = 0.01
    goal = np.array([[np.pi], [0]])
    x_limits = np.array([0, 6.2832])
    numPointsx = 51
    dx = (x_limits[-1] - x_limits[0]) / (numPointsx - 1)
    x_dot_limits = np.array([-6.5, 6.5])
    numPointsx_dot = 81
    dx_dot = (x_dot_limits[-1] - x_dot_limits[0]) / (numPointsx_dot - 1)
    Q = np.array([[30, 0], [0, 1]])
    R = 2.5
    test_policies = False
    environment = pendulum(m, l, b, g, dt, goal, x_limits, dx,
                           x_dot_limits, dx_dot, Q, R)
    environment_target = pendulum(m * mass_factor, l * length_factor, b, g, dt, goal,
                                  x_limits, dx, x_dot_limits, dx_dot, Q, R)

    gamma = 0.99
    x_grid = np.linspace(x_limits[0], x_limits[1], numPointsx)
    x_dot_grid = np.linspace(x_dot_limits[0], x_dot_limits[1], numPointsx_dot)
    u_limits = np.array([-15, 15])
    numPointsu = 121
    u_grid = np.linspace(u_limits[0], u_limits[1], numPointsu)
    num_iterations = 600
    code_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = '../data/VI'
    data_dir = os.path.join(code_dir, data_dir)

    print('Value Iteration for target domain')
    target_file = 'data_m_%.2f_l_%.2f.pkl' % (m * mass_factor, l * length_factor)
    fileFound = False
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if (file.endswith('.pkl') and file == target_file):
                fileFound = True
                print('Relevant pre-computed data found!')
                data = pkl.load(open(os.path.join(data_dir, target_file), 'rb'))
                policy_target = data[0]
                V_target = data[1]
    if (not fileFound):
        policy_target, V_target = ValueIterationSwingUp(environment_target, gamma,
                                                        x_grid, x_dot_grid, u_grid,
                                                        num_iterations)
        pkl.dump((policy_target, V_target),
                 open(os.path.join(data_dir, target_file), 'wb'))

    print('Value Iteration in simulation')
    start_file = 'data_m_%.2f_l_%.2f.pkl' % (m, l)
    fileFound = False
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if (file.endswith('.pkl') and file == start_file):
                fileFound = True
                print('Relevant pre-computed data found!')
                data = pkl.load(open(os.path.join(data_dir, start_file), 'rb'))
                policy_start = data[0]
                V_start = data[1]
    if (not fileFound):
        policy_start, V_start = ValueIterationSwingUp(environment, gamma, x_grid,
                                                      x_dot_grid, u_grid, num_iterations)
        pkl.dump((policy_start, V_start),
                 open(os.path.join(data_dir, start_file), 'wb'))

    V_target = np.reshape(V_target, (numPointsx, numPointsx_dot))
    V_start = np.reshape(V_start, (numPointsx, numPointsx_dot))
    policy_target = np.reshape(policy_target, (numPointsx, numPointsx_dot))
    policy_start = np.reshape(policy_start, (numPointsx, numPointsx_dot))

    if (test_policies):
        policy_start_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_start)
        dyn_sim = lambda t, s: environment.dynamics_continuous(s, policy_start_)
        int_sim = ode(dyn_sim).set_integrator('vode', method='bdf', with_jacobian=False)
        int_sim.set_initial_value(np.array([[0], [0]]), 0)
        t_final = 10
        trajectory_start = np.empty((2, int(t_final / dt) + 1))
        num_steps = 0
        while int_sim.successful() and int_sim.t < t_final:
            int_sim.integrate(int_sim.t + dt)
            trajectory_start[:, num_steps] = int_sim.y[:, 0]
            num_steps += 1
        trajectory_start = trajectory_start[:, 0:num_steps]
        plt.plot(trajectory_start[0, :], trajectory_start[1, :])
        plt.scatter(np.pi, 0, c='red', marker='o')
        plt.xlabel('theta')
        plt.ylabel('theta-dot')
        plt.title('Simulation Policy')
        plt.show()

        dyn_start = lambda t, s: environment_target.dynamics_continuous(s, policy_start_)
        int_start = ode(dyn_start).set_integrator('vode', method='bdf', with_jacobian=False)
        int_start.set_initial_value(np.array([[0], [0]]), 0)
        t_final = 18
        trajectory_bootstrapped = np.empty((2, int(t_final / dt) + 1))
        num_steps = 0
        while int_start.successful() and int_start.t < t_final:
            int_start.integrate(int_start.t + dt)
            trajectory_bootstrapped[:, num_steps] = int_start.y[:, 0]
            num_steps += 1
        trajectory_bootstrapped = trajectory_bootstrapped[:, 0:num_steps]
        plt.plot(trajectory_bootstrapped[0, :], trajectory_bootstrapped[1, :])
        plt.scatter(np.pi, 0, c='red', marker='o')
        plt.xlabel('theta')
        plt.ylabel('theta-dot')
        plt.title('Bootstrapped Policy')
        plt.show()

        policy_target_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_target)
        dyn_target = lambda t, s: environment_target.dynamics_continuous(s, policy_target_)
        int_target = ode(dyn_target).set_integrator('vode', method='bdf', with_jacobian=False)
        int_target.set_initial_value(np.array([[0], [0]]), 0)
        t_final = 10
        trajectory_target = np.empty((2, int(t_final / dt) + 1))
        num_steps = 0
        while int_target.successful() and int_target.t <= t_final:
            int_target.integrate(int_target.t + dt)
            trajectory_target[:, num_steps] = int_target.y[:, 0]
            num_steps += 1
        trajectory_target = trajectory_target[:, 0:num_steps]
        plt.plot(trajectory_target[0, :], trajectory_target[1, :])
        plt.scatter(np.pi, 0, c='red', marker='o')
        plt.xlabel('theta')
        plt.ylabel('theta-dot')
        plt.title('Target Policy')
        plt.show()

    plt.subplot(2, 1, 1)
    plt.imshow(np.abs(V_target).T, aspect='auto',
               extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]),
               origin='upper')
    plt.ylabel('theta-dot')
    plt.xlabel('theta')
    plt.title('Target Value Function')
    plt.colorbar()
    plt.subplot(2, 1, 2)
    plt.imshow(np.abs(policy_target).T, aspect='auto',
               extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]),
               origin='upper')
    plt.ylabel('theta-dot')
    plt.xlabel('theta')
    plt.title('Target policy')
    plt.colorbar()
    # plt.show()

    plt.subplot(2, 1, 1)
    plt.imshow(np.abs(V_start).T, aspect='auto',
               extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]),
               origin='upper')
    plt.ylabel('theta-dot')
    plt.xlabel('theta')
    plt.title('Initial Value Function')
    plt.colorbar()
    plt.subplot(2, 1, 2)
    plt.imshow(np.abs(policy_start).T, aspect='auto',
               extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]),
               origin='upper')
    plt.ylabel('theta-dot')
    plt.xlabel('theta')
    plt.title('Initial policy')
    plt.colorbar()
    # plt.show()

    """ Fit GP on the simulation value function and optimize hyperparameters """
    print('Fitting a GP')
    x_grid = np.linspace(x_limits[0], x_limits[1], numPointsx)
    x_dot_grid = np.linspace(x_dot_limits[0], x_dot_limits[1], numPointsx_dot)
    u_limits = np.array([-15, 15])
    numPointsu = 121
    u_grid = np.linspace(u_limits[0], u_limits[1], numPointsu)

    states = np.mgrid[x_limits[0]:(x_grid[-1] + dx):dx,
                      x_dot_grid[0]:(x_dot_grid[-1] + dx_dot):dx_dot]
    states = np.concatenate((np.reshape(states[0, :, :], (1, states.shape[1] * states.shape[2])),
                             np.reshape(states[1, :, :], (1, states.shape[1] * states.shape[2]))),
                            axis=0)

    V_mu = lambda s: RegularGridInterpolator((x_grid, x_dot_grid), V_start)(s.T)
    Q_mu = buildQfromV(V_mu, environment, gamma, states, u_grid[np.newaxis, :])
    # Q_mu is number_of_actions x number_of_states
    Q_mu = np.reshape(Q_mu.T, (numPointsx, numPointsx_dot, numPointsu))
    Q_mu = RegularGridInterpolator((x_grid, x_dot_grid, u_grid), Q_mu)
    # Nudge actions lying exactly on the grid boundary slightly inwards so the
    # interpolator does not reject queries at the action limits.
    Q_mu_ = lambda s, a: Q_mu(np.concatenate((s, a + 0.0001 * (a[0, :] <= u_grid[0])
                                              - 0.0001 * (a[0, :] >= u_grid[-1])),
                                             axis=0).T)[:, np.newaxis]

    num_points_to_sample = 1500
    x_train = np.concatenate((x_limits[0] + (x_limits[1] - x_limits[0]) * np.random.rand(1, num_points_to_sample),
                              x_dot_limits[0] + (x_dot_limits[1] - x_dot_limits[0]) * np.random.rand(1, num_points_to_sample),
                              u_limits[0] + (u_limits[1] - u_limits[0]) * np.random.rand(1, num_points_to_sample)),
                             axis=0)
    y_train = Q_mu_(x_train[0:2, :], x_train[-1:, :])
    kernel = GPy.kern.RBF(input_dim=3, variance=1,
                          lengthscale=np.array([1, 1, 1]), ARD=True)
    model = GPy.models.GPRegression(X=x_train.T, Y=y_train, kernel=kernel)
    print('Optimizing hyper-parameters')
    model.optimize('lbfgsb')

    V_pred = model.predict(np.concatenate((states,
                                           np.reshape(policy_start,
                                                      (1, numPointsx * numPointsx_dot))),
                                          axis=0).T)[0]
    err = np.mean(V_pred - np.reshape(V_start, (numPointsx * numPointsx_dot, 1)))
    print('Fit error after hyper-parameter optimization : %f' % err)
    set_trace()
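# The fixed sigmaf / sigmal / sigma0 constants hard-coded in the GPTD and GPSARSA
# scripts appear to be hyperparameters obtained from a fit like the one above.
# Assuming the GPy RBF-ARD regression model defined in this script, the optimized
# values could be read back roughly as follows (an illustrative sketch; the variable
# names below are not part of the original code):
sigmaf_fit = float(np.sqrt(model.kern.variance[0]))            # signal standard deviation
sigmal_fit = np.array(model.kern.lengthscale)[:, np.newaxis]   # ARD lengthscale per input dim
sigma0_fit = float(np.sqrt(model.Gaussian_noise.variance[0]))  # observation-noise std-dev
print('sigmaf = %.4f' % sigmaf_fit)
print('sigmal =', sigmal_fit.ravel())
print('sigma0 = %.4f' % sigma0_fit)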
def main(): """ Initialize environments """ m = 1 mass_factor = 1.7 l = 1 length_factor = 0.8 b = 0.15 g = 9.81 dt = 0.01 goal = np.array([[np.pi], [0]]) x_limits = np.array([0, 6.2832]) numPointsx = 51 dx = (x_limits[-1] - x_limits[0]) / (numPointsx - 1) x_dot_limits = np.array([-6.5, 6.5]) numPointsx_dot = 81 dx_dot = (x_dot_limits[-1] - x_dot_limits[0]) / (numPointsx_dot - 1) Q = np.array([[30, 0], [0, 1]]) R = 2.5 test_policies = False environment = pendulum(m, l, b, g, dt, goal, x_limits, dx, x_dot_limits, dx_dot, Q, R) environment_target = pendulum(m * mass_factor, l * length_factor, b, g, dt, goal, x_limits, dx, x_dot_limits, dx_dot, Q, R) """ Learn an initial policy and value function """ gamma = 0.99 x_grid = np.linspace(x_limits[0], x_limits[1], numPointsx) x_dot_grid = np.linspace(x_dot_limits[0], x_dot_limits[1], numPointsx_dot) u_limits = np.array([-15, 15]) numPointsu = 121 u_grid = np.linspace(u_limits[0], u_limits[1], numPointsu) num_iterations = 600 code_dir = os.path.dirname(os.path.realpath(__file__)) data_dir = '../data/GPSARSA' data_dir = os.path.join(code_dir, data_dir) print('Value Iteration for target domain') target_file = 'data_m_%.2f_l_%.2f.pkl' % (m * mass_factor, l * length_factor) fileFound = False for root, dirs, files in os.walk(data_dir): for file in files: if (file.endswith('.pkl') and file == target_file): fileFound = True print('Relevant pre-computed data found!') data = pkl.load(open(os.path.join(data_dir, target_file), 'rb')) policy_target = data[0] V_target = data[1] if (not fileFound): policy_target, V_target = ValueIterationSwingUp( environment_target, gamma, x_grid, x_dot_grid, u_grid, num_iterations) pkl.dump((policy_target, V_target), open(os.path.join(data_dir, target_file), 'wb')) print('Value Iteration in simulation') start_file = 'data_m_%.2f_l_%.2f.pkl' % (m, l) fileFound = False for root, dirs, files in os.walk(data_dir): for file in files: if (file.endswith('.pkl') and file == start_file): fileFound = True print('Relevant pre-computed data found!') data = pkl.load(open(os.path.join(data_dir, start_file), 'rb')) policy_start = data[0] V_start = data[1] if (not fileFound): policy_start, V_start = ValueIterationSwingUp(environment, gamma, x_grid, x_dot_grid, u_grid, num_iterations) pkl.dump((policy_start, V_start), open(os.path.join(data_dir, start_file), 'wb')) V_target = np.reshape(V_target, (numPointsx, numPointsx_dot)) V_start = np.reshape(V_start, (numPointsx, numPointsx_dot)) policy_target = np.reshape(policy_target, (numPointsx, numPointsx_dot)) policy_start = np.reshape(policy_start, (numPointsx, numPointsx_dot)) """ Test learned policies """ if (test_policies): policy_start_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_start) dyn_start = lambda t, s: environment_target.dynamics_continuous( s, policy_start_) int_start = ode(dyn_start).set_integrator('vode', method='bdf', with_jacobian=False) int_start.set_initial_value(np.array([[0], [0]]), 0) t_final = 10 trajectory_start = np.empty((2, int(t_final / dt))) num_steps = 0 while int_start.successful() and int_start.t < t_final: int_start.integrate(int_start.t + dt) trajectory_start[:, num_steps] = int_start.y[:, 0] num_steps += 1 trajectory_start = trajectory_start[:, 0:num_steps] plt.plot(trajectory_start[0, :], trajectory_start[1, :]) plt.scatter(np.pi, 0, c='red', marker='o') plt.xlabel('theta') plt.ylabel('theta-dot') plt.title('Bootstrapped Policy') plt.show() policy_target_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_target) dyn_target = lambda t, s: 
environment_target.dynamics_continuous( s, policy_target_) int_target = ode(dyn_target).set_integrator('vode', method='bdf', with_jacobian=False) int_target.set_initial_value(np.array([[0], [0]]), 0) trajectory_target = np.empty((2, int(t_final / dt))) num_steps = 0 while int_target.successful() and int_target.t < t_final: int_target.integrate(int_target.t + dt) trajectory_target[:, num_steps] = int_target.y[:, 0] num_steps += 1 trajectory_target = trajectory_target[:, 0:num_steps] plt.plot(trajectory_target[0, :], trajectory_target[1, :]) plt.scatter(np.pi, 0, c='red', marker='o') plt.xlabel('theta') plt.ylabel('theta-dot') plt.title('Target Policy') plt.show() """ GPSARSA """ sigma0 = 0.2 sigmaf = 13.6596 sigmal = np.array([[0.5977], [1.9957], [5.7314]]) # sigmaf = 20.8228 # sigmal = np.array([[0.6694],[1.4959],[7.0752]]) # sigmaf = 16.8202 # sigmal = np.array([[1.3087],[2.9121],[9.6583],[7.0756]]) nu = (sigmaf**2) * (np.exp(-1) - 0.36) epsilon = 0.1 max_episode_length = 5000 update_length = 1000 num_episodes = 2000 kernel = SqExpArd(sigmal, sigmaf) states = np.mgrid[x_grid[0]:(x_grid[-1] + dx):dx, x_dot_grid[0]:(x_dot_grid[-1] + dx_dot):dx_dot] states = np.concatenate((np.reshape(states[0,:,:], (1,states.shape[1]*states.shape[2])),\ np.reshape(states[1,:,:], (1,states.shape[1]*states.shape[2]))), axis=0) V_mu = lambda s: RegularGridInterpolator( (x_grid, x_dot_grid), V_start)(s.T) Q_mu = buildQfromV( V_mu, environment, gamma, states, u_grid[np.newaxis, :]) # Q_mu is number_of_actions x number_of_states Q_mu = np.reshape(Q_mu.T, (numPointsx, numPointsx_dot, numPointsu)) Q_mu = RegularGridInterpolator((x_grid, x_dot_grid, u_grid), Q_mu) Q_mu_ = lambda s, a: Q_mu( np.concatenate((s, a + 0.0001 * (a[0, :] <= u_grid[0]) - 0.0001 * (a[0, :] >= u_grid[-1])), axis=0).T)[:, np.newaxis] policy_start_ = RegularGridInterpolator((x_grid, x_dot_grid), policy_start) policy_prior = lambda s: policy_start_(s.T)[:, np.newaxis] print('GPSARSA.. 
') print('Initial mean error:%f' % np.mean(np.abs(V_target - V_start))) update_every = 50 num_elements_inD = 800 num_runs = 1 test_value_error = np.empty( (num_runs, int(num_episodes / update_every) + 2)) test_value_var = np.empty((num_runs, int(num_episodes / update_every) + 2)) test_pos_error = np.empty((num_runs, int(num_episodes / update_every))) for i in range(num_runs): # np.random.seed(i*20) gpsarsa = GPSARSA_monteCarlo(env=environment_target, u_limits=u_limits[np.newaxis,:], \ nu=nu, sigma0=sigma0, gamma=gamma, epsilon=epsilon, kernel=kernel,\ sparseD_size = num_elements_inD, simulation_policy=policy_prior) test_value_error_, test_value_error_, test_pos_error_ = \ gpsarsa.build_policy_monte_carlo(num_episodes=num_episodes, max_episode_length=max_episode_length, \ update_every=update_every, update_length=update_length, \ states_V_target=(states, np.reshape(V_target, (states.shape[1],1)))) V_gpsarsa, policy_gpsarsa = gpsarsa.get_value_function(states) V_gpsarsa = np.reshape(V_gpsarsa, (numPointsx, numPointsx_dot)) test_value_error_ = np.concatenate( (test_value_error_, np.array([np.mean(np.abs(V_gpsarsa - V_target))]))) test_value_error_ = np.concatenate( (np.array([np.mean(np.abs(V_start - V_target))]), test_value_error_)) _, V_gpsarsa_var = gpsarsa.get_Q(states, policy_gpsarsa) test_value_var_ = np.concatenate((np.array([0]), test_value_var_)) test_value_var_ = np.concatenate( (test_value_var_, np.array([np.mean(np.abs(V_gpsarsa_var))]))) print('Final mean error:%f' % np.mean(np.abs(V_target - V_gpsarsa))) test_value_error[i, :] = test_value_error_ test_value_var[i, :] = test_value_var_ test_pos_error[i, :] = test_pos_error_ """ Results """ plt.subplot(3, 1, 1) plt.imshow(np.abs(V_target - V_start).T, aspect='auto',\ extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]), origin='upper') plt.ylabel('theta-dot') plt.xlabel('theta') plt.title('Initial Diff') plt.colorbar() plt.subplot(3, 1, 2) plt.imshow(np.abs(V_target - V_gpsarsa).T, aspect='auto',\ extent=(x_limits[0], x_limits[1], x_dot_limits[1], x_dot_limits[0]), origin='upper') plt.ylabel('theta-dot') plt.xlabel('theta') plt.title('Final Diff') plt.colorbar() plt.subplot(3, 1, 3) plt.scatter(gpsarsa.D[0, :], gpsarsa.D[1, :], marker='o', c='red') plt.xlim(x_limits[0], x_limits[1]) plt.xlabel('theta') plt.ylim(x_dot_limits[0], x_dot_limits[1]) plt.ylabel('theta-dot') plt.title('Dictionary Points') resultDirName = 'GPSARSA_monteCarlo_run' run = -1 for root, dirs, files in os.walk(data_dir): for d in dirs: if (d.startswith(resultDirName)): extension = d.split(resultDirName)[-1] if (extension.isdigit() and int(extension) >= run): run = int(extension) run += 1 saveDirectory = os.path.join(data_dir, resultDirName + str(run)) os.mkdir(saveDirectory) with open(os.path.join(saveDirectory, 'session_%d.pkl' % num_episodes), 'wb') as f_: dl.dump((test_value_error, test_pos_error, gpsarsa), f_) plt.savefig(os.path.join(saveDirectory, 'V_Diff.png')) plt.show() set_trace() sns.tsplot(test_value_error) plt.xlabel('Episodes x%d' % update_every) plt.ylabel('Mean absolute error') plt.title('GPSARSA Monte-Carlo') plt.savefig(os.path.join(saveDirectory, 'Learning_Trend_Value_woMean.png')) plt.show() sns.tsplot(test_value_var) plt.xlabel('Episodes x%d' % update_every) plt.ylabel('Value variance') plt.title('GPSARSA Monte-Carlo') plt.savefig( os.path.join(saveDirectory, 'Learning_Trend_ValueVariance_woMean.png')) plt.show() sns.tsplot(test_pos_error) plt.xlabel('Episodes x%d' % update_every) plt.ylabel('Mean goal error') 
plt.title('GPSARSA Monte-Carlo') plt.savefig(os.path.join(saveDirectory, 'Learning_Trend_Pos_woMean.png')) set_trace()