def __init__(self,
             gamma=0.9,
             act_rand=0.3,
             r_max=1,
             h=10,
             w=10,
             n_trajs=100,
             l_traj=20,
             rand_start=True,
             learning_rate=0.02,
             n_iters=20,
             save_dir="./exps",
             exp_name="gw_" + str(int(time.time())),
             n_exp=20,
             feat_map=None,
             gpu_fraction=0.2,
             terminal=True):
    """Set up a gridworld experiment and solve its ground-truth MDP.

    Creates the result directory, builds an ``h`` x ``w`` gridworld with
    reward ``r_max`` in three corner cells, runs value iteration on it,
    saves a ground-truth plot and generates the demonstrations.

    Args:
        gamma: discount factor passed to value iteration.
        act_rand: ``1 - act_rand`` is passed as the third argument of
            ``gridworld.GridWorld``.
        r_max: reward placed in cells (h-1, w-1), (0, w-1) and (h-1, 0).
        h, w: grid height and width.
        n_trajs, l_traj, rand_start, learning_rate, n_iters, n_exp: stored
            on the instance for later use.
        save_dir, exp_name: results go to ``save_dir + "/" + exp_name``.
        feat_map: state feature matrix; defaults to the identity of size
            ``h * w`` when None.
        gpu_fraction: stored on the instance for later use.
        terminal: when true the three rewarded cells are passed to
            ``GridWorld`` as its second argument, otherwise an empty
            container is passed.

    Raises:
        SystemExit: if the result directory already exists.
    """
    # NOTE: the exp_name default is evaluated once, when the function is
    # defined, so every instance created without an explicit exp_name in the
    # same process shares one timestamp.
    self._gamma = gamma
    self._act_rand = act_rand
    self._r_max = r_max
    self._h = h
    self._w = w
    self._n_trajs = n_trajs
    self._l_traj = l_traj
    self._rand_start = rand_start
    self._learning_rate = learning_rate
    self._n_iters = n_iters
    self._save_dir = save_dir
    self._exp_name = exp_name
    self._n_exp = n_exp

    self._exp_result_path = save_dir + "/" + exp_name
    if not os.path.exists(self._exp_result_path):
        os.makedirs(self._exp_result_path)
    else:
        # Refuse to overwrite the results of an earlier experiment.
        logging.warning(self._exp_result_path + " has existed")
        exit()

    rmap_gt = np.zeros([h, w])
    rmap_gt[h - 1, w - 1] = rmap_gt[0, w - 1] = rmap_gt[h - 1, 0] = r_max

    # Use the constructor arguments (act_rand, h, w, gamma) rather than the
    # module-level constants, so non-default arguments take effect.
    if terminal:
        self._gw = gridworld.GridWorld(
            rmap_gt, {(h - 1, w - 1), (0, w - 1), (h - 1, 0)}, 1 - act_rand)
    else:
        self._gw = gridworld.GridWorld(rmap_gt, {}, 1 - act_rand)

    # Flatten column-major, matching the order='F' reshapes used for plots.
    self._rewards_gt = np.reshape(rmap_gt, h * w, order='F')
    self._P_a = self._gw.get_transition_mat()

    ts = time.time()
    self._values_gt, self._policy_gt = value_iteration.value_iteration(
        self._P_a, self._rewards_gt, gamma, error=0.01, deterministic=True)
    te = time.time()
    # Single-argument print keeps the output identical on Python 2 and 3.
    print("value iteration time of ground truth:  %s" % (te - ts))

    ts = time.time()
    self.save_plt("gt", (3 * w, h), self._rewards_gt, self._values_gt,
                  self._policy_gt)
    te = time.time()
    print("saving plt time:  %s" % (te - ts))

    self._demo_trajs = self.generate_demonstrations()
    self._feat_map = np.eye(h * w) if feat_map is None else feat_map
    self._gpu_fraction = gpu_fraction
def main():
    """Run MaxEnt IRL on a single-goal gridworld and plot the outcome.

    Builds an H x W gridworld whose only rewarded cell is (H-1, W-1), solves
    it with value iteration, samples demonstrations from the resulting
    policy, runs ``maxent_irl`` on them and shows four heatmaps: true and
    recovered reward, true and recovered value.
    """
    N_STATES = H * W
    N_ACTIONS = 5

    def as_grid(flat):
        # Per-state vectors are reshaped column-major (order='F').
        return np.reshape(flat, (H, W), order='F')

    # Ground-truth reward map: zero except for R_MAX at cell (H-1, W-1).
    reward_grid = np.zeros([H, W])
    reward_grid[H - 1, W - 1] = R_MAX

    world = gridworld.GridWorld(reward_grid, {}, 1 - ACT_RAND)
    true_rewards = np.reshape(reward_grid, H * W, order='F')
    transitions = world.get_transition_mat()

    true_values, demo_policy = value_iteration.value_iteration(
        transitions, true_rewards, GAMMA, error=0.01, deterministic=True)

    # Identity matrix as the state feature map.
    features = np.eye(N_STATES)
    print(features.shape)

    # Seed NumPy's RNG before the demonstrations are generated.
    np.random.seed(1)
    demos = generate_demonstrations(
        world, demo_policy,
        n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
    learned_rewards = maxent_irl(
        features, transitions, GAMMA, demos, LEARNING_RATE, N_ITERS)

    learned_values, _ = value_iteration.value_iteration(
        transitions, learned_rewards, GAMMA, error=0.01, deterministic=True)

    panels = [
        (reward_grid, 'Rewards Map - Ground Truth'),
        (as_grid(true_values), 'Value Map - Ground Truth'),
        (as_grid(learned_rewards), 'Reward Map - Recovered'),
        (as_grid(learned_values), 'Value Map - Recovered'),
    ]
    plt.figure(figsize=(20, 4))
    for slot, (data, title) in enumerate(panels, 1):
        plt.subplot(1, 4, slot)
        img_utils.heatmap2d(data, title, block=False)
    plt.show()
def main():
    """Run deep MaxEnt IRL on a three-goal gridworld and plot the outcome.

    The reward R_MAX sits in cells (H-1, W-1), (0, W-1) and (H-1, 0). The
    identity feature map is converted to a float torch tensor before it is
    handed to ``deep_maxent_irl``.
    """
    N_STATES = H * W
    N_ACTIONS = 5

    def as_grid(flat):
        # Per-state vectors are reshaped column-major (order='F').
        return np.reshape(flat, (H, W), order='F')

    reward_grid = np.zeros([H, W])
    for cell in ((H - 1, W - 1), (0, W - 1), (H - 1, 0)):
        reward_grid[cell] = R_MAX

    world = gridworld.GridWorld(reward_grid, {}, 1 - ACT_RAND)
    true_rewards = np.reshape(reward_grid, H * W, order='F')
    transitions = world.get_transition_mat()

    true_values, demo_policy = value_iteration.value_iteration(
        transitions, true_rewards, GAMMA, error=0.01, deterministic=True)

    # Identity matrix as the state feature map, as a float tensor.
    features = np.eye(N_STATES)
    features_torch = torch.tensor(features, dtype=torch.float)

    demos = generate_demonstrations(
        world, demo_policy,
        n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    learned_rewards = deep_maxent_irl(
        features_torch, transitions, GAMMA, demos, LEARNING_RATE, N_ITERS)

    learned_values, _ = value_iteration.value_iteration(
        transitions, learned_rewards, GAMMA, error=0.01, deterministic=True)

    panels = [
        (as_grid(true_rewards), 'Rewards Map - Ground Truth'),
        (as_grid(true_values), 'Value Map - Ground Truth'),
        (as_grid(learned_rewards), 'Reward Map - Recovered'),
        (as_grid(learned_values), 'Value Map - Recovered'),
    ]
    plt.figure(figsize=(20, 4))
    for slot, (data, title) in enumerate(panels, 1):
        plt.subplot(1, 4, slot)
        img_utils.heatmap2d(data, title, block=False)
    plt.show()
def main():
    """Run MaxEnt IRL on trajectories from ``mod.exp1_case2`` and save the map.

    For each seed index (currently only 0) the recovered reward heatmap is
    written to ``results/rewards_seed<seed>.png``.
    """
    for seed in range(1):
        N_STATES = H * W

        # Ground-truth reward map with the goal at cell (H-1, W-1).
        reward_grid = np.zeros([H, W])
        reward_grid[H - 1, W - 1] = R_MAX

        world = gridworld.GridWorld(reward_grid, {}, 1 - ACT_RAND)
        true_rewards = np.reshape(reward_grid, H * W, order='F')
        transitions = world.get_transition_mat()

        # The ground-truth solution is computed but not used below, because
        # the trajectories come from mod.exp1_case2().
        true_values, demo_policy = value_iteration.value_iteration(
            transitions, true_rewards, GAMMA, error=0.01, deterministic=True)

        # Identity matrix as the state feature map.
        features = np.eye(N_STATES)

        # NOTE(review): the RNG is always seeded with 0, not with `seed`.
        np.random.seed(0)
        demos = mod.exp1_case2()

        learned_rewards = maxent_irl(
            world, features, transitions, GAMMA, demos,
            LEARNING_RATE, N_ITERS)

        plt.figure(figsize=(20, 20))
        learned_grid = np.reshape(learned_rewards, (H, W), order='F')
        img_utils.heatmap2d(learned_grid, 'Reward Map', block=False)
        plt.plot()

        figname = "results/rewards_seed{0}".format(seed) + ".png"
        plt.savefig(figname)
def main():
    """Run deep MaxEnt IRL with histogram features and plot rewards/paths.

    The reward map comes from ``set_rewards2()``. Besides the reward and
    value heatmaps, the paths returned by ``gw.display_path_grid`` for the
    ground-truth and the recovered policy are shown on the second row.
    """
    N_STATES = H * W
    N_ACTIONS = 4

    def as_grid(flat):
        # Per-state vectors are reshaped column-major (order='F').
        return np.reshape(flat, (H, W), order='F')

    reward_grid = set_rewards2()
    world = gridworld.GridWorld(reward_grid, {}, 1 - ACT_RAND)
    true_rewards = np.reshape(reward_grid, H * W, order='F')
    transitions = world.get_transition_mat()

    true_values, demo_policy = value_iteration.value_iteration(
        transitions, true_rewards, GAMMA, error=0.01, deterministic=True)
    true_path = world.display_path_grid(demo_policy)

    # Histogram features are used here instead of the identity matrix.
    features = feature_histogram(world)

    demos = generate_demonstrations(
        world, demo_policy,
        n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    learned_rewards = deep_maxent_irl(
        features, transitions, GAMMA, demos, LEARNING_RATE, N_ITERS)

    learned_values, learned_policy = value_iteration.value_iteration(
        transitions, learned_rewards, GAMMA, error=0.01, deterministic=True)
    learned_path = world.display_path_grid(learned_policy)

    # (subplot slot, data, title); slot 6 of the 2x4 layout is left empty.
    panels = [
        (1, reward_grid, 'Rewards Map - Ground Truth'),
        (2, as_grid(true_values), 'Value Map - Ground Truth'),
        (3, as_grid(learned_rewards), 'Reward Map - Recovered'),
        (4, as_grid(learned_values), 'Value Map - Recovered'),
        (5, as_grid(true_path), 'Path Map - Ground Truth'),
        (7, as_grid(learned_path), 'Path Map - Recovered'),
    ]
    plt.figure(figsize=(20, 4))
    for slot, data, title in panels:
        plt.subplot(2, 4, slot)
        img_utils.heatmap2d(data, title, block=False)
    plt.show()
def main():
    """Compare LP, MaxEnt, deep MaxEnt and deep siamese MaxEnt IRL.

    A first gridworld with R_MAX at cells (H-2, W-2) and (1, 1) is solved;
    its normalized value function then becomes the ground-truth reward of a
    second gridworld, on which all four IRL methods are run. The five reward
    maps are shown side by side.
    """
    N_STATES = H * W
    N_ACTIONS = 5

    def as_grid(flat):
        # Per-state vectors are reshaped column-major (order='F').
        return np.reshape(flat, (H, W), order='F')

    def solve(trans, reward_vec):
        return value_iteration.value_iteration(
            trans, reward_vec, GAMMA, error=0.01, deterministic=True)

    # Stage 1: sparse two-goal reward.
    seed_grid = np.zeros([H, W])
    seed_grid[H - 2, W - 2] = R_MAX
    seed_grid[1, 1] = R_MAX

    world = gridworld.GridWorld(seed_grid, {}, 1 - ACT_RAND)
    true_rewards = np.reshape(seed_grid, H * W, order='F')
    transitions = world.get_transition_mat()
    true_values, demo_policy = solve(transitions, true_rewards)

    # Stage 2: the normalized values of stage 1 are the new ground truth.
    true_rewards = normalize(true_values)
    world = gridworld.GridWorld(as_grid(true_rewards), {}, 1 - ACT_RAND)
    transitions = world.get_transition_mat()
    true_values, demo_policy = solve(transitions, true_rewards)

    # Identity matrix as the state feature map.
    features = np.eye(N_STATES)

    demos = generate_demonstrations(
        world, demo_policy,
        n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    print('LP IRL training ..')
    lp_rewards = lp_irl(
        transitions, demo_policy, gamma=0.3, l1=10, R_max=R_MAX)

    print('Max Ent IRL training ..')
    maxent_rewards = maxent_irl(
        features, transitions, GAMMA, demos,
        LEARNING_RATE * 2, N_ITERS * 2)

    print('Deep Max Ent IRL training ..')
    deep_rewards = deep_maxent_irl(
        features, transitions, GAMMA, demos, LEARNING_RATE, N_ITERS)

    print('Deep Siamese Max Ent IRL training ..')
    siamese_rewards = deep_siamese_maxent_irl(
        features, transitions, GAMMA, demos, LEARNING_RATE, N_ITERS)

    panels = [
        (true_rewards, 'Rewards Map - Ground Truth'),
        (lp_rewards, 'Reward Map - LP'),
        (maxent_rewards, 'Reward Map - Maxent'),
        (deep_rewards, 'Reward Map - Deep Maxent'),
        (siamese_rewards, 'Reward Map - Deep Policy Maxent'),
    ]
    plt.figure(figsize=(20, 5))
    for slot, (reward_vec, title) in enumerate(panels, 1):
        plt.subplot(1, 5, slot)
        img_utils.heatmap2d(as_grid(reward_vec), title, block=False)
    plt.show()
if __name__ == "__main__":
    # Script entry point: build a small gridworld MDP, show its reward map
    # and tabulate its transition probabilities into the dense array P.
    # Every name bound here is a module-level global.
    height = 5
    width = 5
    N_s = height * width  # number of states, one per grid cell
    N_a = 4  # left right up and down
    R_max = 10
    trans_prob = 0.7  # with 30% prob takes random action other than chosen
    # gamma, lmbda and iterations are not used in the visible part of this
    # block; presumably they are consumed by code further down (TODO confirm).
    gamma = 0.5
    lmbda = 10
    iterations = 100

    # Reward grid: zero everywhere except R_max at cell (height-1, width-1).
    grid = np.zeros((height, width))
    grid[height - 1][width - 1] = R_max
    # The same cell is passed as the second GridWorld argument
    # (presumably the set of terminal states; verify against gridworld).
    gw_mdp = gridworld.GridWorld(grid, {(height - 1, width - 1)}, trans_prob)
    R_mat = gw_mdp.get_reward_mat()
    # show rewards map
    show_heatmap(R_mat, 'Ground Truth of Reward')

    # P[s_i, s_j, a] holds the probability returned by the gridworld for
    # moving from state index s_i to state index s_j under action a.
    P = np.zeros((N_s, N_s, N_a))  # transition matrix
    for s_i in range(N_s):
        # The gridworld API works on positions, so convert the flat index.
        state_i = gw_mdp.idx2pos(s_i)
        for a in range(N_a):
            probabilities = gw_mdp.get_transition_states_and_probs(state_i, a)
            for state_j, prob in probabilities:
                s_j = gw_mdp.pos2idx(state_j)
                P[s_i, s_j, a] = prob
def main():
    """Show the ground-truth reward, value and path maps, then exit.

    The reward map comes from ``set_rewards()`` and cell (H-1, W-1) is passed
    to ``GridWorld`` as its second argument.

    NOTE: ``sys.exit()`` is called right after the first figure, so the
    MaxEnt IRL part below it is currently unreachable.
    """
    N_STATES = H * W
    N_ACTIONS = 4

    def as_grid(flat):
        # Per-state vectors are reshaped column-major (order='F').
        return np.reshape(flat, (H, W), order='F')

    def show_panels(n_rows, n_cols, panels):
        plt.figure(figsize=(20, 4))
        for slot, data, title in panels:
            plt.subplot(n_rows, n_cols, slot)
            img_utils.heatmap2d(data, title, block=False)
        plt.show()

    reward_grid = set_rewards()
    world = gridworld.GridWorld(reward_grid, {(H - 1, W - 1)}, 1 - ACT_RAND)
    true_rewards = np.reshape(reward_grid, H * W, order='F')
    transitions = world.get_transition_mat()

    true_values, demo_policy = value_iteration.value_iteration(
        transitions, true_rewards, GAMMA, error=0.01, deterministic=True)
    true_path = world.display_path_grid(demo_policy)

    # From here on the reward map is the one reported by the gridworld.
    reward_grid = world.get_reward_mat()

    # Temporary preview of the ground truth; the script stops afterwards.
    show_panels(1, 3, [
        (1, reward_grid, 'Rewards Map - Ground Truth'),
        (2, as_grid(true_values), 'Value Map - Ground Truth'),
        (3, as_grid(true_path), 'Path Map - Ground Truth'),
    ])
    sys.exit()

    # ---- unreachable while the sys.exit() above is in place ----
    features = feature_histogram(world)

    np.random.seed(1)
    demos = generate_demonstrations(
        world, demo_policy,
        n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
    learned_rewards = maxent_irl(
        features, transitions, GAMMA, demos, LEARNING_RATE, N_ITERS)

    learned_values, learned_policy = value_iteration.value_iteration(
        transitions, learned_rewards, GAMMA, error=0.01, deterministic=True)
    learned_path = world.display_path_grid(learned_policy)

    # Slot 6 of the 2x4 layout is left empty.
    show_panels(2, 4, [
        (1, reward_grid, 'Rewards Map - Ground Truth'),
        (2, as_grid(true_values), 'Value Map - Ground Truth'),
        (3, as_grid(learned_rewards), 'Reward Map - Recovered'),
        (4, as_grid(learned_values), 'Value Map - Recovered'),
        (5, as_grid(true_path), 'Path Map - Ground Truth'),
        (7, as_grid(learned_path), 'Path Map - Recovered'),
    ])
def main():
    """
    Recover gridworld reward using linear programming IRL
    """
    H = 10
    W = 10
    N_STATES = H * W
    N_ACTIONS = 5

    # Gridworld of string cells: all '0' except the reward at (H-1, W-1).
    grid = [['0'] * W for _ in range(H)]
    grid[H - 1][W - 1] = str(R_MAX)

    world = gridworld.GridWorld(grid, {(H - 1, W - 1)}, 1 - ACT_RAND)

    # Solve the MDP with the value-iteration agent.
    agent = value_iteration.ValueIterationAgent(world, GAMMA, 100)

    reward_mat = world.get_reward_mat()
    print('show rewards map. any key to continue')
    img_utils.heatmap2d(reward_mat, 'Reward Map - Ground Truth')

    value_mat = world.get_values_mat(agent.get_values())
    print('show values map. any key to continue')
    img_utils.heatmap2d(value_mat, 'Value Map - Ground Truth')

    # Transition tensor: transitions[si, sj, a] is the probability of
    # reaching state sj from state si under action a.
    transitions = np.zeros((N_STATES, N_STATES, N_ACTIONS))
    for src_idx in range(N_STATES):
        src_pos = world.idx2pos(src_idx)
        for action in range(N_ACTIONS):
            outcomes = world.get_transition_states_and_probs(src_pos, action)
            for dst_pos, prob in outcomes:
                transitions[src_idx, world.pos2idx(dst_pos), action] = prob

    # Print policy and values in the gridworld, for debugging only.
    world.display_policy_grid(agent.get_optimal_policy())
    world.display_value_grid(agent.values)

    # Policy vector: the agent's action for every state index.
    policy = np.zeros(N_STATES)
    for state_idx in range(N_STATES):
        policy[state_idx] = agent.get_action(world.idx2pos(state_idx))

    # Solve for the rewards.
    rewards = lp_irl(transitions, policy, gamma=GAMMA, l1=L1, R_max=R_MAX)

    # Show the recovered rewards as 2-D and 3-D heatmaps.
    print('show recoverred rewards map. any key to continue')
    img_utils.heatmap2d(
        np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered')
    img_utils.heatmap3d(
        np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered')
def main():
    """Run LP and MaxEnt IRL on recorded tracks and save the reward maps.

    The first and last positions of ``pixel_locations[0]`` receive reward
    R_MAX. The normalized value function of that gridworld becomes the
    ground-truth reward of a second gridworld. Each entry of
    ``pixel_locations`` is converted into a trajectory of ``Step`` records,
    and the coast / forest / land maps form the three feature columns.
    Results are written to GroundTruth.png, LP.png and MaxEnt.png.
    """
    first_track = pixel_locations[0]
    last_row = len(first_track.index) - 1
    start_coordinates = (first_track['location-lat'][0],
                         first_track['location-long'][0])
    end_coordinates = (first_track['location-lat'][last_row],
                       first_track['location-long'][last_row])

    # NOTE(review): the map is allocated as [W, H] but reshaped to (H, W)
    # everywhere else; left unchanged, confirm the intended orientation.
    rmap_gt = np.zeros([W, H])
    rmap_gt[int(start_coordinates[0]), int(start_coordinates[1])] = R_MAX
    rmap_gt[int(end_coordinates[0]), int(end_coordinates[1])] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(
        P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # The normalized values of the first world are the new ground truth.
    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(
        np.reshape(rewards_gt, (H, W), order='F'), {}, 1 - ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(
        P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # One feature column per map. Each column is built from its OWN file;
    # previously forest and land were overwritten with the coast map, which
    # made all three columns identical.
    coast_map = np.reshape(
        np.load('Feature Maps/small_maps/coast.npy'), (600, 1))
    forest_map = np.reshape(
        np.load('Feature Maps/small_maps/forest.npy'), (600, 1))
    land_map = np.reshape(
        np.load('Feature Maps/small_maps/land.npy'), (600, 1))
    feat_map = np.hstack((coast_map, forest_map, land_map))

    # Populate trajectories: one list of Step records per track, one Step
    # per pair of consecutive positions.
    trajs = []
    terminal_state = end_coordinates
    for x in range(len(pixel_locations)):
        track = pixel_locations[x]
        traj = []
        for i in range(len(track) - 1):
            loc = track.iloc[i]
            next_loc = track.iloc[i + 1]
            traj.append(
                Step(cur_state=int(gw.pos2idx(loc)),
                     action=get_action(loc, next_loc),
                     next_state=int(gw.pos2idx(next_loc)),
                     reward=rmap_gt[int(next_loc[0]), int(next_loc[1])],
                     done=np.array_equal(next_loc, terminal_state)))
        trajs.append(traj)

    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=100, R_max=R_MAX)

    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(
        feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    # Plots: the same figure is reused and saved after each heatmap.
    fig = plt.figure()
    plt.subplot(1, 2, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth', block=False)
    fig.savefig('GroundTruth.png')

    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP', block=False)
    fig.savefig('LP.png')

    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent', block=False)
    fig.savefig('MaxEnt.png')
def main():
    """Run MaxEnt IRL on a single-goal gridworld, save and plot the result.

    The recovered reward vector is written to ``results/rewards.txt``; the
    recovered reward and the value function solved from it are then shown
    in two separate figures.
    """
    N_STATES = H * W
    N_ACTIONS = 5

    # (An interactive prompt that collected "bad states" from the user used
    # to live here; it was disabled in the original.)

    def as_grid(flat):
        # Per-state vectors are reshaped column-major (order='F').
        return np.reshape(flat, (H, W), order='F')

    # Ground-truth reward map: zero except for R_MAX at cell (H-1, W-1).
    reward_grid = np.zeros([H, W])
    reward_grid[H - 1, W - 1] = R_MAX

    world = gridworld.GridWorld(reward_grid, {}, 1 - ACT_RAND)
    true_rewards = np.reshape(reward_grid, H * W, order='F')
    transitions = world.get_transition_mat()

    true_values, demo_policy = value_iteration.value_iteration(
        transitions, true_rewards, GAMMA, error=0.01, deterministic=True)

    # Identity matrix as the state feature map.
    features = np.eye(N_STATES)

    # Seed NumPy's RNG before the demonstrations are generated.
    np.random.seed(1)
    demos = generate_demonstrations(
        world, demo_policy,
        n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
    learned_rewards = maxent_irl(
        features, transitions, GAMMA, demos, LEARNING_RATE, N_ITERS)

    np.savetxt('results/rewards.txt', learned_rewards)

    learned_values, learned_policy = value_iteration.value_iteration(
        transitions, learned_rewards, GAMMA, error=0.01, deterministic=True)

    # NOTE(review): the second figure is titled 'Policy Map' but the data it
    # shows is the value function, not the policy.
    for state_vec, title in ((learned_rewards, 'Reward Map'),
                             (learned_values, 'Policy Map')):
        plt.figure(figsize=(20, 20))
        img_utils.heatmap2d(as_grid(state_vec), title, block=False)
        plt.plot()
    plt.show()
def main():
    """Run deep MaxEnt IRL with random features and report timing and EVD.

    The reward R_MAX sits in cells (H-1, W-1), (0, W-1) and (H-1, 0). The
    feature map is a random H x W array, flattened to length N_STATES unless
    ``ARGS.conv`` is set. After training, the elapsed time and the expected
    value difference are printed and four heatmaps are shown.
    """
    N_STATES = H * W
    N_ACTIONS = 5

    def as_grid(flat):
        # Per-state vectors are reshaped column-major (order='F').
        return np.reshape(flat, (H, W), order='F')

    reward_grid = np.zeros([H, W])
    for cell in ((H - 1, W - 1), (0, W - 1), (H - 1, 0)):
        reward_grid[cell] = R_MAX

    world = gridworld.GridWorld(reward_grid, {}, 1 - ACT_RAND)
    true_rewards = np.reshape(reward_grid, H * W, order='F')

    # With no action noise the deterministic transition matrix is used.
    transitions = (world.get_transition_mat_deterministic()
                   if ACT_RAND == 0 else world.get_transition_mat())

    true_values, demo_policy = value_iteration.value_iteration(
        transitions, true_rewards, GAMMA, error=0.01, deterministic=True)

    # Random per-state features; kept 2-D only for the conv variant.
    features = np.random.rand(N_STATES).reshape((H, W))
    if not ARGS.conv:
        features = features.reshape(N_STATES)

    demos = generate_demonstrations(
        world, demo_policy,
        n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    started = time.time()
    learned_rewards = deep_maxent_irl(
        features, transitions, GAMMA, demos, LEARNING_RATE, N_ITERS,
        ARGS.conv, ARGS.sparse)
    elapsed = time.time() - started
    print('time for dirl', elapsed)

    learned_values, learned_policy = value_iteration.value_iteration(
        transitions, learned_rewards, GAMMA, error=0.01, deterministic=True)

    evd = value_iteration.expected_value_diff(
        transitions, true_rewards, GAMMA,
        start_state_probs(demos, N_STATES), true_values, learned_policy)
    print('evd', evd)

    panels = [
        (true_rewards, 'Rewards Map - Ground Truth'),
        (true_values, 'Value Map - Ground Truth'),
        (learned_rewards, 'Reward Map - Recovered'),
        (learned_values, 'Value Map - Recovered'),
    ]
    plt.figure(figsize=(20, 4))
    for slot, (state_vec, title) in enumerate(panels, 1):
        plt.subplot(1, 4, slot)
        img_utils.heatmap2d(as_grid(state_vec), title, block=False)
    plt.show()
def main():
    """
    Recover gridworld reward using linear programming IRL

    A 10 x 10 gridworld with penalty cells is solved with the
    value-iteration agent, its policy is fed to ``lp_irl`` and the
    ground-truth and recovered maps are plotted on a 2 x 4 layout.
    """
    H = 10
    W = 10
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld including the reward
    grid = [
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['-1', '-1', '-1', '-1', '-1', '0', '0', '-1', '-1', '-1'],
        ['0', '0', '0', '0', '0', '0', '0', '-1', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '-1', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '-1', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', str(R_MAX)],
    ]

    # Custom rescaling: free cells cost -1 and penalty cells cost -10.
    # Cells are matched by VALUE (the original used `is`, which compares
    # object identity and only worked through CPython constant sharing).
    # A single lookup per cell keeps the mapping non-cascading, so '0'
    # becomes '-1' and is not then turned into '-10'.
    remap = {'0': '-1', '-1': '-10'}
    grid = [[remap.get(cell, cell) for cell in row] for row in grid]

    # grid, terminal state, trans_prob
    gw = gridworld.GridWorld(grid, {(H - 1, W - 1)}, 1 - ACT_RAND)

    # solve the MDP using value iteration
    vi = value_iteration.ValueIterationAgent(gw, GAMMA, 100)
    r_mat_gt = gw.get_reward_mat()
    v_mat_gt = gw.get_values_mat(vi.get_values())

    # Construct transition matrix: P_a[si, sj, a] is the probability of
    # reaching sj from si given action a.
    P_a = np.zeros((N_STATES, N_STATES, N_ACTIONS))
    for si in range(N_STATES):
        statei = gw.idx2pos(si)
        for a in range(N_ACTIONS):
            probs = gw.get_transition_states_and_probs(statei, a)
            for statej, prob in probs:
                P_a[si, gw.pos2idx(statej), a] = prob

    # display policy and value in gridworld just for debug use
    gw.display_policy_grid(vi.get_optimal_policy())
    gw.display_value_grid(vi.values)

    # path obtained by following the optimal policy
    path_gt = gw.display_path_grid(vi.get_optimal_policy())

    # setup policy: the agent's action for every state index
    policy = np.zeros(N_STATES)
    for i in range(N_STATES):
        policy[i] = vi.get_action(gw.idx2pos(i))

    # ------ After getting optimal policy through iterations ------
    # solve for the rewards
    rewards = lp_irl(P_a, policy, gamma=GAMMA, l1=L1, R_max=R_MAX)
    r_mat = np.reshape(rewards, (H, W), order='F')
    # NOTE(review): the "recovered" value map and path are taken from the
    # same agent `vi` as the ground truth, not re-solved from `rewards`,
    # so those two panels duplicate the ground-truth ones.
    v_mat = gw.get_values_mat(vi.get_values())
    path = gw.display_path_grid(vi.get_optimal_policy())

    # Single-argument print behaves the same on Python 2 and 3.
    print('show recoverred rewards map. any key to continue')
    print('show optimal path. any key to continue')

    # plots; slot 6 of the 2x4 layout is left empty
    panels = [
        (1, r_mat_gt, 'Rewards Map - Ground Truth'),
        (2, np.reshape(v_mat_gt, (H, W), order='F'),
         'Value Map - Ground Truth'),
        (3, np.reshape(r_mat, (H, W), order='F'), 'Reward Map - Recovered'),
        (4, np.reshape(v_mat, (H, W), order='F'), 'Value Map - Recovered'),
        (5, np.reshape(path_gt, (H, W), order='F'),
         'Path Map - Ground Truth'),
        (7, np.reshape(path, (H, W), order='F'), 'Path Map - Recovered'),
    ]
    plt.figure(figsize=(20, 4))
    for slot, data, title in panels:
        plt.subplot(2, 4, slot)
        img_utils.heatmap2d(data, title, block=False)
    plt.show()
def main():
    """Run MaxEnt IRL on a single-goal gridworld and plot the outcome.

    NOTE: a ``pdb.set_trace()`` breakpoint fires right after ``maxent_irl``
    returns, so the run stops in the debugger before the recovered values
    are computed and plotted.
    """
    N_STATES = H * W
    N_ACTIONS = 5

    def as_grid(flat):
        # Per-state vectors are reshaped column-major (order='F').
        return np.reshape(flat, (H, W), order='F')

    # Ground-truth reward map: zero except for R_MAX at cell (H-1, W-1).
    reward_grid = np.zeros([H, W])
    reward_grid[H - 1, W - 1] = R_MAX

    world = gridworld.GridWorld(reward_grid, {}, 1 - ACT_RAND)
    true_rewards = np.reshape(reward_grid, H * W, order='F')

    # Transition probabilities of the world: for each of the 5 actions, the
    # probability of moving from one state to another.
    transitions = world.get_transition_mat()

    # Values and policy under the ground-truth rewards.
    true_values, demo_policy = value_iteration.value_iteration(
        transitions, true_rewards, GAMMA, error=0.01, deterministic=True)

    # Identity matrix as features (one-hot encoding of the state).
    features = np.eye(N_STATES)

    # Seed NumPy's RNG before the demonstrations are generated.
    np.random.seed(1)
    demos = generate_demonstrations(
        world, demo_policy,
        n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    # maxent_irl needs the feature map and the transition probabilities.
    learned_rewards = maxent_irl(
        features, transitions, GAMMA, demos, LEARNING_RATE, N_ITERS)

    pdb.set_trace()

    learned_values, _ = value_iteration.value_iteration(
        transitions, learned_rewards, GAMMA, error=0.01, deterministic=True)

    panels = [
        (reward_grid, 'Rewards Map - Ground Truth'),
        (as_grid(true_values), 'Value Map - Ground Truth'),
        (as_grid(learned_rewards), 'Reward Map - Recovered'),
        (as_grid(learned_values), 'Value Map - Recovered'),
    ]
    plt.figure(figsize=(20, 4))
    for slot, (data, title) in enumerate(panels, 1):
        plt.subplot(1, 4, slot)
        img_utils.heatmap2d(data, title, block=False)
    plt.show()
def main():
    """Run MaxEnt IRL with true or empirically estimated dynamics.

    Command-line options set the grid size, discount, trajectory counts and
    learning parameters. Transition dynamics are estimated from the
    trajectories returned by ``generate_random`` by counting and
    normalizing. With ``--approx`` the estimate replaces the true transition
    matrix for both value iteration and IRL. Two figures are shown: the
    reward/value maps, and the true vs. estimated next-state distribution
    for one (state, action) pair.
    """
    # argument parser for command line arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-wid', '--width', default=5, type=int,
                        help='width of the gridworld')
    parser.add_argument('-hei', '--height', default=5, type=int,
                        help='height of the gridworld')
    parser.add_argument('-lr', '--learning_rate', default=0.01, type=float,
                        help='learning rate')
    parser.add_argument('-l', '--l_traj', default=20, type=int,
                        help='length of expert trajectory')
    parser.add_argument('--no-rand_start', dest='rand_start',
                        action='store_false',
                        help='when sampling trajectories, fix start positions')
    parser.add_argument('--rand_start', dest='rand_start',
                        action='store_true',
                        help='when sampling trajectories, randomly pick start positions')
    parser.add_argument('--approx', dest='approx', action='store_true',
                        help='flag to perform approximation of psa')
    parser.add_argument('-g', '--gamma', default=0.9, type=float,
                        help='discount factor')
    parser.add_argument('-n', '--n_iters', default=20, type=int,
                        help='number of iterations')
    parser.add_argument('-t', '--n_trajs', default=100, type=int,
                        help='number of expert trajectories')
    parser.add_argument('-a', '--act_random', default=0.3, type=float,
                        help='probability of acting randomly')
    # rand_start is off unless --rand_start is given
    parser.set_defaults(rand_start=False)
    args = parser.parse_args()

    # arguments for environment and irl algorithm
    r_max = 1
    gamma = args.gamma
    width = args.width
    height = args.height
    l_traj = args.l_traj
    approx = args.approx
    n_iters = args.n_iters
    n_trajs = args.n_trajs
    act_rand = args.act_random
    rand_start = args.rand_start
    learning_rate = args.learning_rate

    # variables for number of actions and states
    n_actions = 5
    n_states = height * width

    # rmap_gt is the ground truth for rewards: r_max in three corner cells
    rmap_gt = np.zeros([height, width])
    rmap_gt[0, width - 1] = r_max
    rmap_gt[height - 1, 0] = r_max
    rmap_gt[height - 1, width - 1] = r_max

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - act_rand)

    # true rewards (column-major flattening) and state transition dynamics
    rewards_gt = np.reshape(rmap_gt, height * width, order='F')
    P_a_true = gw.get_transition_mat()

    trajs = generate_random(gw, n_actions, n_trajs=n_trajs, len_traj=l_traj,
                            rand_start=rand_start)

    # Approximate the dynamics: count observed (state, next_state, action)
    # triples ...
    P_a_approx = np.zeros((n_states, n_states, n_actions))
    for traj in trajs:
        for step in traj:
            P_a_approx[step.cur_state, step.next_state, step.action] += 1
    # ... then normalize each (state, action) slice into a distribution.
    # Pairs that were never observed stay all-zero.
    for s in range(n_states):
        for a in range(n_actions):
            total = np.sum(P_a_approx[s, :, a])
            if total != 0:
                P_a_approx[s, :, a] /= total

    P_a = P_a_approx if approx else P_a_true

    # true value function and policy from the reward map
    values_gt, policy_gt = value_iteration.value_iteration(
        P_a, rewards_gt, gamma, error=0.01, deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(n_states)

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=n_trajs,
                                    len_traj=l_traj, rand_start=rand_start)

    # perform inverse reinforcement learning to get reward function
    rewards = maxent_irl(feat_map, P_a, gamma, trajs, learning_rate, n_iters)
    values, _ = value_iteration.value_iteration(
        P_a, rewards, gamma, error=0.01, deterministic=True)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 2, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 2, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (height, width), order='F'),
                        'Value Map - Ground Truth', block=False)
    plt.subplot(2, 2, 3)
    img_utils.heatmap2d(np.reshape(rewards, (height, width), order='F'),
                        'Reward Map - Recovered', block=False)
    plt.subplot(2, 2, 4)
    img_utils.heatmap2d(np.reshape(values, (height, width), order='F'),
                        'Value Map - Recovered', block=False)
    plt.show()

    # Plots for state transition dynamics. State 10 / action 2 is shown as
    # before; the state index is clamped so grids with 10 or fewer states
    # no longer raise IndexError.
    probe_state = min(10, n_states - 1)
    probe_action = 2
    plt.figure(figsize=(10, 4))
    plt.subplot(2, 1, 1)
    img_utils.heatmap2d(
        np.reshape(P_a_true[probe_state, :, probe_action],
                   (height, width), order='F'),
        'True Dist', block=False)
    plt.subplot(2, 1, 2)
    img_utils.heatmap2d(
        np.reshape(P_a_approx[probe_state, :, probe_action],
                   (height, width), order='F'),
        'Approx Dist', block=False)
    plt.show()