learning_rate, "alpha_%d.pkl", "alpha_205.pkl", 205) pkl.dump(r, open("maxent_reward.pkl", 'wb')) return r if __name__ == '__main__': train(0.01, 1, 400, 0.01) rewards = pkl.load(open("maxent_reward.pkl", 'rb')) env = Env(prepare_tp=True) value = vi.value(env.get_policy(), env.n_states, env.transition_probability, rewards, 0.3) opt_value = vi.optimal_value(env.n_states, env.n_actions, env.transition_probability, rewards, 0.3) pkl.dump(value, open("maxent_value.pkl", 'wb')) pkl.dump(opt_value, open("maxent_opt_value.pkl", 'wb')) value = pkl.load(open("maxent_value.pkl", 'rb')) opt_value = pkl.load(open("maxent_opt_value.pkl", 'rb')) status = validate(value) print(status) pkl.dump(status, open("maxent_status.pkl", 'wb')) status = validate(opt_value) print(status) pkl.dump(status, open("maxent_opt_status.pkl", 'wb')) status = validate(rewards) print(status) pkl.dump(status, open("maxent_rewards_status.pkl", 'wb'))
def test_gw_once(grid_size, feature_map, n_samples, epochs, structure):
    """
    Test MaxEnt and DeepMaxEnt on a gridworld of size grid_size with the
    feature map feature_map, using n_samples sampled paths.

    grid_size: Grid size. int.
    feature_map: Which feature map to use. String in {ident, coord, proxi}.
    n_samples: Number of paths to sample.
    epochs: Number of epochs to run MaxEnt with.
    structure: Neural network structure tuple, e.g. (3, 3) would be a 3-layer
        neural network with assumed inputs.
    -> Expected value difference for MaxEnt, DeepMaxEnt
    """

    # Basic gist of what we're doing here: Get the reward function using our
    # different IRL methods, use those to get a policy, evaluate that policy
    # using the true reward, and then return the difference in expected values.

    # Setup parameters.
    wind = 0.3
    discount = 0.9
    learning_rate = 0.01
    trajectory_length = 3*grid_size

    # Make the gridworld and associated data.
    gw = Gridworld(grid_size, wind, discount)
    feature_matrix = gw.feature_matrix(feature_map)
    ground_reward = np.array([gw.reward(i) for i in range(gw.n_states)])
    optimal_policy = value_iteration.find_policy(gw.n_states, gw.n_actions,
                                                 gw.transition_probability,
                                                 ground_reward,
                                                 discount).argmax(axis=1)
    trajectories = gw.generate_trajectories(n_samples, trajectory_length,
                                            optimal_policy.take)
    p_start_state = (np.bincount(trajectories[:, 0, 0],
                                 minlength=gw.n_states) /
                     trajectories.shape[0])

    # True value.
    optimal_V = value_iteration.optimal_value(gw.n_states, gw.n_actions,
                                              gw.transition_probability,
                                              ground_reward, gw.discount)

    # MaxEnt reward; policy; value.
    maxent_reward = deep_maxent.irl((feature_matrix.shape[1],),
                                    feature_matrix, gw.n_actions, gw.discount,
                                    gw.transition_probability, trajectories,
                                    epochs, learning_rate)
    maxent_policy = value_iteration.find_policy(gw.n_states, gw.n_actions,
                                                gw.transition_probability,
                                                maxent_reward,
                                                discount).argmax(axis=1)
    maxent_V = value_iteration.value(maxent_policy, gw.n_states,
                                     gw.transition_probability,
                                     ground_reward, gw.discount)
    maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)

    # DeepMaxEnt reward; policy; value.
    deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],) + structure,
                                         feature_matrix, gw.n_actions,
                                         gw.discount,
                                         gw.transition_probability,
                                         trajectories, epochs, learning_rate)
    deep_maxent_policy = value_iteration.find_policy(gw.n_states, gw.n_actions,
                                                     gw.transition_probability,
                                                     deep_maxent_reward,
                                                     discount).argmax(axis=1)
    deep_maxent_V = value_iteration.value(deep_maxent_policy, gw.n_states,
                                          gw.transition_probability,
                                          ground_reward, gw.discount)
    deep_maxent_EVD = (optimal_V.dot(p_start_state) -
                       deep_maxent_V.dot(p_start_state))

    # Plot ground-truth and recovered rewards, policies and values in a
    # 3x3 panel grid.
    panels = [
        (ground_reward, "Groundtruth reward", {}),
        (maxent_reward, "MaxEnt reward", {}),
        (deep_maxent_reward, "DeepMaxEnt reward", {}),
        (optimal_policy, "Optimal policy", {"vmin": 0, "vmax": 3}),
        (maxent_policy, "MaxEnt policy", {"vmin": 0, "vmax": 3}),
        (deep_maxent_policy, "DeepMaxEnt policy", {"vmin": 0, "vmax": 3}),
        (optimal_V, "Optimal value", {}),
        (maxent_V, "MaxEnt value", {}),
        (deep_maxent_V, "DeepMaxEnt value", {}),
    ]
    for i, (data, title, kwargs) in enumerate(panels, start=1):
        plt.subplot(3, 3, i)
        plt.pcolor(data.reshape((grid_size, grid_size)), **kwargs)
        plt.title(title)
        plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                        labelright=False, bottom=False, top=False, left=False,
                        right=False)

    plt.savefig("{}_{}_{}_{}_{}_gridworld_{}.png".format(
        grid_size, feature_map, n_samples, epochs, structure,
        np.random.randint(10000000)))

    return maxent_EVD, deep_maxent_EVD
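# The expected value difference (EVD) returned above is just a dot product
# with the empirical start-state distribution of the demonstrations. The
# helper below is an illustrative, self-contained restatement of that
# computation; expected_value_difference is not a function in this codebase.
import numpy as np


def expected_value_difference(optimal_V, recovered_V, trajectories, n_states):
    """Illustrative: EVD of a recovered policy against the optimal one.

    optimal_V, recovered_V: per-state values under the ground-truth reward.
    trajectories: int array of shape (n_trajectories, length, 2) whose
        [:, 0, 0] entries are the start states, as produced by
        generate_trajectories above.
    """
    # Empirical distribution over start states observed in the demonstrations.
    p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=n_states) /
                     trajectories.shape[0])
    # Difference in expected return when starting from a demonstration start.
    return optimal_V.dot(p_start_state) - recovered_V.dot(p_start_state)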
def test_ow_once(grid_size, n_objects, n_colours, discrete, l1, l2, n_samples,
                 epochs, structure):
    """
    Test MaxEnt and DeepMaxEnt on an objectworld of size grid_size, using
    n_samples sampled paths.

    grid_size: Grid size. int.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    discrete: Whether the features should be discrete. bool.
    l1: L1 regularisation. float.
    l2: L2 regularisation. float.
    n_samples: Number of paths to sample.
    epochs: Number of epochs to run MaxEnt with.
    structure: Neural network structure tuple, e.g. (3, 3) would be a 3-layer
        neural network with assumed inputs.
    -> Expected value difference for MaxEnt, DeepMaxEnt
    """

    # Basic gist of what we're doing here: Get the reward function using our
    # different IRL methods, use those to get a policy, evaluate that policy
    # using the true reward, and then return the difference in expected values.

    # Setup parameters.
    wind = 0.3
    discount = 0.9
    learning_rate = 0.01
    trajectory_length = 3*grid_size

    # Make the objectworld and associated data.
    ow = Objectworld(grid_size, n_objects, n_colours, wind, discount)
    feature_matrix = ow.feature_matrix(discrete)
    ground_reward = np.array([ow.reward(i) for i in range(ow.n_states)])
    optimal_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                 ow.transition_probability,
                                                 ground_reward,
                                                 discount).argmax(axis=1)
    trajectories = ow.generate_trajectories(n_samples, trajectory_length,
                                            optimal_policy.take)
    p_start_state = (np.bincount(trajectories[:, 0, 0],
                                 minlength=ow.n_states) /
                     trajectories.shape[0])

    # True value.
    optimal_V = value_iteration.optimal_value(ow.n_states, ow.n_actions,
                                              ow.transition_probability,
                                              ground_reward, ow.discount)

    # MaxEnt reward; policy; value.
    maxent_reward = deep_maxent.irl((feature_matrix.shape[1],),
                                    feature_matrix, ow.n_actions, ow.discount,
                                    ow.transition_probability, trajectories,
                                    epochs, learning_rate, l1=l1, l2=l2)
    maxent_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                ow.transition_probability,
                                                maxent_reward,
                                                discount).argmax(axis=1)
    maxent_V = value_iteration.value(maxent_policy, ow.n_states,
                                     ow.transition_probability,
                                     ground_reward, ow.discount)
    maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)

    # DeepMaxEnt reward; policy; value.
    deep_learning_rate = 0.005  # For the 32 x 32 experiments.
    deep_maxent_reward = deep_maxent.irl(
        (feature_matrix.shape[1],) + structure, feature_matrix, ow.n_actions,
        ow.discount, ow.transition_probability, trajectories, epochs,
        deep_learning_rate, l1=l1, l2=l2)
    deep_maxent_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                     ow.transition_probability,
                                                     deep_maxent_reward,
                                                     discount).argmax(axis=1)
    deep_maxent_V = value_iteration.value(deep_maxent_policy, ow.n_states,
                                          ow.transition_probability,
                                          ground_reward, ow.discount)
    deep_maxent_EVD = (optimal_V.dot(p_start_state) -
                       deep_maxent_V.dot(p_start_state))

    # Plot ground-truth and recovered rewards, policies and values in a
    # 3x3 panel grid.
    panels = [
        (ground_reward, "Groundtruth reward", {}),
        (maxent_reward, "MaxEnt reward", {}),
        (deep_maxent_reward, "DeepMaxEnt reward", {}),
        (optimal_policy, "Optimal policy", {"vmin": 0, "vmax": 3}),
        (maxent_policy, "MaxEnt policy", {"vmin": 0, "vmax": 3}),
        (deep_maxent_policy, "DeepMaxEnt policy", {"vmin": 0, "vmax": 3}),
        (optimal_V, "Optimal value", {}),
        (maxent_V, "MaxEnt value", {}),
        (deep_maxent_V, "DeepMaxEnt value", {}),
    ]
    for i, (data, title, kwargs) in enumerate(panels, start=1):
        plt.subplot(3, 3, i)
        plt.pcolor(data.reshape((grid_size, grid_size)), **kwargs)
        plt.title(title)
        plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                        labelright=False, bottom=False, top=False, left=False,
                        right=False)

    plt.savefig("{}_{}_{}_{}_{}_{}_{}_{}_{}_objectworld_{}.png".format(
        grid_size, n_objects, n_colours, discrete, n_samples, epochs,
        structure, l1, l2, np.random.randint(10000000)))

    return maxent_EVD, deep_maxent_EVD
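# A hypothetical driver for test_ow_once (not in the original script): it
# averages the two EVDs over several independently generated objectworlds.
# The wrapper name test_ow and the example hyperparameters below are
# placeholders for illustration only.
def test_ow(grid_size, n_objects, n_colours, discrete, l1, l2, n_samples,
            epochs, structure, n_repeats=3):
    """Average (MaxEnt EVD, DeepMaxEnt EVD) over n_repeats random worlds."""
    evds = np.array([test_ow_once(grid_size, n_objects, n_colours, discrete,
                                  l1, l2, n_samples, epochs, structure)
                     for _ in range(n_repeats)])
    # Column 0: MaxEnt EVD, column 1: DeepMaxEnt EVD. Each call also writes
    # its 3x3 panel figure to disk.
    return evds.mean(axis=0)

# Example call with placeholder settings:
# maxent_evd, deep_evd = test_ow(10, 15, 2, False, 0, 0, 20, 150, (3, 3))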
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state, wind=0.0, algo="maxent",
         mdp="gridworld"):
    """
    Run inverse reinforcement learning on the objectworld or gridworld MDP
    and plot the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: (x, y) start location to generate trajectories from.
    wind: Probability of the agent moving randomly. float.
    algo: IRL algorithm to run (currently supports maxent and deep_maxnet).
    mdp: Which MDP to use. String in {gridworld, objectworld}.
    """
    sx, sy = start_state
    trajectory_length = 8

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    optimal_v = optimal_value(ow.n_states, ow.n_actions,
                              ow.transition_probability, normalize(ground_r),
                              ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)
    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)

    # ow.plot_grid("value_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
    #                                                epochs, wind),
    #              value=optimal_v)
    ow.plot_grid("policy_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                    epochs, wind),
                 policy=policy, value=optimal_v)

    # The ground-truth state visitation frequencies are needed for plotting
    # regardless of which IRL algorithm recovers the reward, so import
    # irl.maxent unconditionally.
    import irl.maxent as maxent
    ground_svf = maxent.find_svf(ow.n_states, trajectories)

    r = []
    if algo == "maxent":
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)
    elif algo == "deep_maxnet":
        import irl.deep_maxent as deep_maxent
        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1],) + structure,
                            feature_matrix, ow.n_actions, discount,
                            ow.transition_probability, trajectories, epochs,
                            learning_rate, l1=l1, l2=l2)

    recovered_policy = find_policy(ow.n_states, ow.n_actions,
                                   ow.transition_probability, normalize(r),
                                   ow.discount, stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)
    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    # ow.plot_grid("recovered_value_{}_t{}_e{}_w{}.png".format(
    #     algo, n_trajectories, epochs, wind), value=recovered_v)
    ow.plot_grid("recovered_policy_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
        policy=recovered_policy, value=recovered_v)

    # print("new trajectory")
    # for t in new_trajectory:
    #     for s, a, rw in t:
    #         print(ow.int_to_point(s), ow.actions[a], rw)
    #     print("---------")

    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]

    plt.subplot(111)
    plt.pcolor(x, y, ground_svf.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth SVF")
    plt.savefig("ground_svf_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                       epochs, wind),
                format="png", dpi=150)

    plt.pcolor(x, y, recovered_svf.reshape((grid_size, grid_size)))
    plt.title("Recovered SVF")
plt.savefig("recovered_svf_{}_t{}_e{}_w{}.png".format( algo, n_trajectories, epochs, wind), format="png", dpi=150) plt.pcolor(x, y, normalize(ground_r).reshape((grid_size, grid_size))) plt.title("Groundtruth reward") plt.savefig("ground_reward_{}_t{}_e{}_w{}.png".format( algo, n_trajectories, epochs, wind), format="png", dpi=150) plt.pcolor(x, y, normalize(r).reshape((grid_size, grid_size))) plt.title("Recovered reward") plt.savefig("recovered_reward_{}_t{}_e{}_w{}.png".format( algo, n_trajectories, epochs, wind), format="png", dpi=150)