def main():
    # Init Gym
    env = gym.make(ARGS.environment)
    OBS_S = env.observation_space.shape
    trajs = play(env)

    # use the raw observation as the feature map
    feat_map_np = env.reset()
    # feat_map_np = voxelize(feat_map_np)
    feat_map = torch.tensor(feat_map_np, dtype=torch.float)

    # placeholder shaped like the Atari observation; a true transition
    # matrix is not available for this environment
    P_a = np.ones((210, 160, 3))

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    # plots (no ground-truth reward map exists for the Gym environment,
    # so only the recovered reward is shown)
    plt.figure(figsize=(20, 4))
    img_utils.heatmap2d(np.reshape(rewards, OBS_S[:2], order='F'),
                        'Reward Map - Recovered', block=False)
    plt.show()
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    print(feat_map.shape)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)

    np.random.seed(1)
    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ, rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA,
                                                error=0.01, deterministic=True)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth', block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered', block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered', block=False)
    plt.show()
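# A minimal sketch of the generate_demonstrations helper used above, assuming a
# GridWorld with reset(start_pos) and step(action) -> (next_state, reward, done),
# a pos2idx method for flattening, and the Step namedtuple used elsewhere in
# this file. The helper in the actual repo may differ in its sampling details.
def generate_demonstrations(gw, policy, n_trajs=100, len_traj=20, rand_start=False):
    trajs = []
    for _ in range(n_trajs):
        # pick a start cell, randomly or fixed at the origin
        start = ((np.random.randint(0, H), np.random.randint(0, W))
                 if rand_start else (0, 0))
        gw.reset(start)
        cur_state = start
        episode = []
        for _ in range(len_traj):
            action = int(policy[gw.pos2idx(cur_state)])
            next_state, reward, done = gw.step(action)
            episode.append(Step(cur_state=gw.pos2idx(cur_state), action=action,
                                next_state=gw.pos2idx(next_state),
                                reward=reward, done=done))
            cur_state = next_state
            if done:
                break
        trajs.append(episode)
    return trajs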
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[0, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    feat_map_torch = torch.tensor(feat_map, dtype=torch.float)

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ, rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map_torch, P_a, GAMMA, trajs,
                              LEARNING_RATE, N_ITERS)
    # rewards = rewards.detach().numpy()  # uncomment if a torch tensor is returned

    values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA,
                                                error=0.01, deterministic=True)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth', block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered', block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered', block=False)
    plt.show()
def main():
    for seed in range(1):
        N_STATES = H * W

        # init the gridworld
        # rmap_gt is the ground truth for rewards
        rmap_gt = np.zeros([H, W])
        # goal coordinates
        rmap_gt[H - 1, W - 1] = R_MAX

        gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

        rewards_gt = np.reshape(rmap_gt, H * W, order='F')
        P_a = gw.get_transition_mat()
        values_gt, policy_gt = value_iteration.value_iteration(
            P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

        # use identity matrix as feature
        feat_map = np.eye(N_STATES)

        # other two features. due to the linear nature,
        # the following two features might not work as well as the identity.
        # feat_map = feature_basis(gw)
        # feat_map = feature_coord(gw)

        np.random.seed(0)
        # trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS,
        #                                 len_traj=L_TRAJ, rand_start=RAND_START)
        trajs = mod.exp1_case2()
        rewards = maxent_irl(gw, feat_map, P_a, GAMMA, trajs,
                             LEARNING_RATE, N_ITERS)
        # np.savetxt('results/rewards.txt', rewards)
        # values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA,
        #                                                  error=0.01,
        #                                                  deterministic=True)

        # plots
        plt.figure(figsize=(20, 20))
        img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                            'Reward Map', block=False)
        plt.plot()
        # now = datetime.datetime.now()
        # figname = "results/rewards_{0:%m%d%H%M}".format(now) + ".png"
        figname = "results/rewards_seed{0}.png".format(seed)
        plt.savefig(figname)
def test_irl_algorithms(gw, P_a, rmap_gt, policy_gt, trajs, feat_map):
    # print('LP IRL training ..')
    # rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)
    # print('Max Ent IRL training ..')
    # rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE*2, N_ITERS*2)
    # print('Deep Max Ent IRL training ..')
    # rewards_fc = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    # print('Deep Policy Max Ent IRL training ..')
    rewards = deep_siamese_maxent_irl(feat_map, P_a, GAMMA, trajs,
                                      LEARNING_RATE, N_ITERS)
    values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA,
                                                error=0.01, deterministic=True)

    # plots: only the ground truth and the siamese result are computed above;
    # re-enable the subplots below together with the corresponding trainings
    plt.figure(figsize=(20, 8))
    plt.subplot(1, 2, 1)
    img_utils.heatmap2d(to_plot(rmap_gt), 'Rewards Map - Ground Truth',
                        block=False, text=False)
    # plt.subplot(1, 5, 2)
    # img_utils.heatmap2d(to_plot(rewards_lpirl), 'Reward Map - LP', block=False, text=False)
    # plt.subplot(1, 5, 3)
    # img_utils.heatmap2d(to_plot(rewards_maxent), 'Reward Map - Maxent', block=False, text=False)
    # plt.subplot(1, 5, 4)
    # img_utils.heatmap2d(to_plot(rewards_fc), 'Reward Map - Deep Maxent', block=False, text=False)
    plt.subplot(1, 2, 2)
    img_utils.heatmap2d(to_plot(rewards), 'Reward Map - Deep Siamese Maxent',
                        block=False, text=False)
    plt.show()
def save_plt(self, name, figsize, rewards, values, policy):
    plt.figure(figsize=figsize)
    plt.subplot(1, 3, 1)
    img_utils.heatmap2d(np.reshape(rewards, (self._h, self._w), order='F'),
                        'Rewards Map', block=False)
    plt.subplot(1, 3, 2)
    img_utils.heatmap2d(np.reshape(values, (self._h, self._w), order='F'),
                        'Value Map', block=False)
    plt.subplot(1, 3, 3)
    img_utils.heatmap2d(np.reshape(policy, (self._h, self._w), order='F'),
                        'Policy Map', block=False)
    plt.savefig(self._exp_result_path + "/" + name + ".png")
    plt.close()
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, structure):
    wind = 0.3
    trajectory_length = 8
    l1 = l2 = 0

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, discount)
    print(ow.objects.keys())

    rewards_gt = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy_gt = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                            rewards_gt, ow.discount, stochastic=False)
    trajs = ow.generate_trajectories(N_TRAJS, L_TRAJ, lambda s: policy_gt[s])
    feat_map = ow.feature_matrix(ow.objects, discrete=False)

    rewards_inv = np.array([ow.inverse_reward(s_inv)
                            for s_inv in range(ow.n_states)])
    policy_inv = find_inverted_policy(ow.n_states, ow.n_actions,
                                      ow.transition_probability, rewards_inv,
                                      ow.discount, stochastic=False)
    trajs_inv = ow.generate_inverse_trajectories(N_TRAJS, L_TRAJ,
                                                 lambda s_inv: policy_inv[s_inv])
    feat_map_inv = ow.inv_feature_matrix(ow.inverted_objects, discrete=False)

    print('LP IRL training ..')
    rewards_lpirl = lp_irl(ow.transition_probability, policy_gt, gamma=0.3,
                           l1=10, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, ow.transition_probability, GAMMA,
                                trajs, LEARNING_RATE * 2, N_ITERS * 2)
    print('Deep Max Ent IRL training ..')
    rewards_deep = deep_maxent_irl(feat_map, ow.transition_probability, GAMMA,
                                   trajs, LEARNING_RATE, N_ITERS)
    print('Deep Siamese Max Ent IRL training ..')
    rewards = deep_siamese_maxent_irl(feat_map, feat_map_inv,
                                      ow.transition_probability, GAMMA,
                                      trajs, trajs_inv, LEARNING_RATE, N_ITERS)

    # plots
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 5, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 5, 2)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP', block=False)
    plt.subplot(1, 5, 3)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent', block=False)
    plt.subplot(1, 5, 4)
    img_utils.heatmap2d(np.reshape(rewards_deep, (H, W), order='F'),
                        'Reward Map - Deep Maxent', block=False)
    plt.subplot(1, 5, 5)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Deep Siamese Maxent', block=False)
    plt.show()
def ada_irl_update(self):
    for episode in range(self.Episode):
        # initial observation
        observation = self.env.reset()
        # action = self.RL.choose_action(observation)
        states = observation
        eq_r = 0
        eq_step = 0
        while eq_step < len(self.IRL.expert) * 3:
            # print(eq_step)
            # fresh env
            self.env.render()
            action = self.RL.choose_action(observation)
            observation_, reward, done = self.env.step(action)
            reward = self.IRL.reward(observation_)
            self.RL.learn(observation, action, reward, observation_)
            observation = observation_
            # action = action_
            states = np.vstack((states, observation))
            eq_r += reward
            eq_step += 1
            if done:
                break
        # print(states)
        self.IRL.learn(states)
        print("Episode %d | Reward" % episode, eq_r)
        # print(self.IRL.reward_weight)

    print('Game Over')
    print(self.RL.q_table)

    reward_weight = self.IRL.reward_weight.reshape([self.env.col, self.env.row])
    expert_reward = self.IRL.reward_weight[self.expert]
    print(reward_weight)

    actual_reward = np.zeros_like(reward_weight)
    actual_reward[5, 5] = 10
    actual_reward[6, 1] = -5
    actual_reward[1, 6] = -5
    actual_reward[4, 7] = -5
    actual_reward[7, 4] = -5
    print(actual_reward)

    plt.figure(figsize=(25, 10))
    plt.subplot(1, 2, 1)
    img_utils.heatmap2d(actual_reward, 'Reward MAP - Ground Truth', block=False)
    plt.subplot(1, 2, 2)
    img_utils.heatmap2d(reward_weight, 'Reward MAP - ddlGAN', block=False)
    plt.show()

    img_utils.heatmap3d(reward_weight, 'Reward MAP - ddlGAN')
    plt.show()

    print(expert_reward)
    x = np.arange(len(self.IRL.expert))
    plt.plot(x, expert_reward, 'r-', lw=5)
    plt.show()
for s1 in range(N_STATES):
    s = int(traj[i - 1])
    action = int(act[s])
    if P_a[s][action][s1] == 1:
        traj[i] = s1

for j in range(0, TRAJ_LEN, 1):
    x, y = int_to_point(traj[j])
    x = int(x)
    y = int(y)
    traj_demo[y][x] = 1

# cap rewards for display; the flat index for cell (i, j) is i * W + j
# (the source used i + j, which revisits the same entries; a stray
# `r[i+j] = 0` is dropped here as leftover)
for i in range(H):
    for j in range(W):
        if r[i * W + j] > 20:
            r[i * W + j] = 20

'''
plt.figure(figsize=(H, W))
img_utils.heatmap2d(np.reshape(r, (H, W)), 'Reward Map', block=False, text=False)
plt.show()
'''

plt.imshow(traj_demo)
plt.show()

plt.figure(figsize=(H, W))
img_utils.heatmap2d(np.reshape(value, (H, W)), 'Value', block=False, text=False)
plt.show()
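# A minimal sketch of the int_to_point helper used above. The reshapes in this
# snippet use NumPy's default row-major (C) order, so flat index i maps to row
# i // W and column i % W; it is returned as (x, y) = (col, row) to match the
# `traj_demo[y][x]` indexing. The real helper may use a different convention.
def int_to_point(i):
    return (i % W, i // W)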
def main(): N_STATES = H * W N_ACTIONS = 5 # init the gridworld # rmap_gt is the ground truth for rewards #""" rmap_gt = np.zeros([H, W]) rmap_gt[H - 1, W - 1] = R_MAX rmap_gt[H - 1, 0] = R_MAX ACT_RAND = 0 gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND) rewards_gt = np.reshape(rmap_gt, H * W, order='F') """ rmap_gt = np.zeros([H, W]) rmap_gt[H - 2, W - 2] = R_MAX rmap_gt[1, 1] = R_MAX gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND) rewards_gt = np.reshape(rmap_gt, H * W, order='F') P_a = gw.get_transition_mat() values_gt, policy_gt = value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) rewards_gt = normalize(values_gt) gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {}, 1 - ACT_RAND) """ # P_a = gw.get_transition_mat() values_gt, policy_gt = value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) # use identity matrix as feature feat_map = np.eye(N_STATES) np.random.seed(1) eg = [] tg = [] unseen = [] for i in range(59, 60): print("i = {}".format(i + 1)) N_TRAJS = 100 L_TRAJ = (i + 1) trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) #rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS) rewards_ent, policy_ent, n_unseen = maxent_irl_ent(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, deterministic=True) value_ent = GetValue(policy_ent, P_a, rewards_gt, GAMMA, deterministic=True) _, policy_theta = value_iteration(P_a, rewards_ent, GAMMA, error=0.01, deterministic=True) value_theta = GetValue(policy_theta, P_a, rewards_gt, GAMMA, deterministic=True) eg.append( np.linalg.norm(value_ent - values_gt) / np.linalg.norm(values_gt)) tg.append( np.linalg.norm(value_theta - values_gt) / np.linalg.norm(values_gt)) unseen.append(n_unseen) unseen = np.array(unseen) plt.figure(1) plt.plot(eg, marker='.') plt.plot(tg, marker='.') #plt.plot((unseen / max(unseen)), marker='.') plt.grid(True) plt.ylabel('||Vgt - V||2/||Vgt||2') plt.xlabel('length_expert_demos') plt.legend(['V = Vent', 'V = Vtheta']) #, '#unseen']) plt.show() plt.figure(2) plt.plot(values_gt, marker='.') plt.plot(value_theta, marker='.') plt.plot(value_ent, marker='.') plt.grid(True) plt.ylabel('V') plt.xlabel('s') plt.legend(['Vgt', 'Vtheta', 'Vent']) plt.show() # plots plt.figure(figsize=(25, 5)) plt.subplot(1, 5, 1) img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'), 'Rewards Map - Ground Truth', block=False) plt.subplot(1, 5, 2) img_utils.heatmap2d(np.reshape(rewards_ent, (H, W), order='F'), 'Reward_Ent - Recovered', block=False) plt.subplot(1, 5, 3) img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'), 'Value - Ground Truth', block=False) plt.subplot(1, 5, 4) img_utils.heatmap2d(np.reshape(value_ent, (H, W), order='F'), 'Value Ent - Recovered', block=False) plt.subplot(1, 5, 5) img_utils.heatmap2d(np.reshape(value_theta, (H, W), order='F'), 'Value Theta - Recovered', block=False) plt.show()
def main(): """ Recover gridworld reward using linear programming IRL """ H = 10 W = 10 N_STATES = H * W N_ACTIONS = 5 # init the gridworld grid = [['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', str(R_MAX)]] gw = gridworld.GridWorld(grid, {(H - 1, W - 1)}, 1 - ACT_RAND) # solve the MDP using value iteration vi = value_iteration.ValueIterationAgent(gw, GAMMA, 100) r_mat = gw.get_reward_mat() print 'show rewards map. any key to continue' img_utils.heatmap2d(r_mat, 'Reward Map - Ground Truth') v_mat = gw.get_values_mat(vi.get_values()) print 'show values map. any key to continue' img_utils.heatmap2d(v_mat, 'Value Map - Ground Truth') # Construct transition matrix P_a = np.zeros((N_STATES, N_STATES, N_ACTIONS)) for si in range(N_STATES): statei = gw.idx2pos(si) for a in range(N_ACTIONS): probs = gw.get_transition_states_and_probs(statei, a) for statej, prob in probs: sj = gw.pos2idx(statej) # Prob of si to sj given action a P_a[si, sj, a] = prob # display policy and value in gridworld just for debug use gw.display_policy_grid(vi.get_optimal_policy()) gw.display_value_grid(vi.values) # setup policy policy = np.zeros(N_STATES) for i in range(N_STATES): policy[i] = vi.get_action(gw.idx2pos(i)) # solve for the rewards rewards = lp_irl(P_a, policy, gamma=GAMMA, l1=L1, R_max=R_MAX) # display recoverred rewards print 'show recoverred rewards map. any key to continue' img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered') img_utils.heatmap3d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered')
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards()
    gw = gridworld.GridWorld(rmap_gt, {(H - 1, W - 1)}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA,
                                                           error=0.01,
                                                           deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)
    rmap_gt = gw.get_reward_mat()

    # temp: debug plot of the ground truth only; the sys.exit() below makes
    # the rest of this function unreachable until this block is removed
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 3, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 3, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth', block=False)
    plt.subplot(1, 3, 3)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth', block=False)
    plt.show()
    sys.exit()

    # feat_map = np.eye(N_STATES)
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    feat_map = feature_histogram(gw)

    np.random.seed(1)
    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ, rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA,
                                                     error=0.01,
                                                     deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth', block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered', block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered', block=False)
    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth', block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered', block=False)
    plt.show()
if is_plt:
    if is_v:
        # gridworld
        # single or multiple destination
        terminal_single = traj[len(traj) - 1]
        terminal_single2list = [terminal_single]
        # pass terminal_single2list instead of terminal for the
        # single-destination case
        value, policy = vi.value_iteration(SHAPE, r, discount, terminal)
        # value = np.exp(value)

        # single or multiple start point
        mu_exp = compute_state_visitation_freq(SHAPE, traj, policy)
        # mu_exp = compute_state_visitation_freq_multiple_starts(SHAPE, traj, start, policy)

        plt.subplot(2, 4, 5)
        img_utils.heatmap2d(np.reshape(mu_exp, (H, W)), 'Expected SVF',
                            block=False, text=False)
        plt.subplot(2, 4, 6)
        img_utils.heatmap2d(np.reshape(value, (H, W)), 'Value',
                            block=False, text=False)
        plt.subplot(2, 4, 4)
        alpha = 0.8
        # weighted overlay of the expected SVF on the reward map
        integrated_weighted = (alpha * 800 * np.reshape(mu_exp, [H, W])
                               + (1 - alpha) * np.reshape(r, (H, W)))
        img_utils.heatmap2d(integrated_weighted, 'Integrated map',
                            block=False, text=False)
        plt.subplot(2, 4, 1)
        img_utils.heatmap2d(np.reshape(ref, [H, W]), 'Expert SVF',
                            block=False, text=False)
        plt.subplot(2, 4, 2)
        img_utils.heatmap2d(traj_svf, 'Trajectory SVF', block=False, text=False)
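# A minimal sketch of compute_state_visitation_freq, following the standard
# MaxEnt IRL forward pass (Ziebart et al., 2008): propagate the demo's start
# state forward under the given deterministic policy. The transition matrix
# P_a (shape [N_STATES, N_STATES, N_ACTIONS]) is taken as an explicit argument
# here; the version called above presumably derives it from SHAPE internally.
def compute_state_visitation_freq(shape, traj, policy, P_a):
    n_states = shape[0] * shape[1]
    T = len(traj)
    mu = np.zeros([n_states, T])
    mu[int(traj[0]), 0] = 1.0  # start distribution: the demo's first state
    for t in range(T - 1):
        for s in range(n_states):
            a = int(policy[s])
            # push probability mass one step forward under the policy
            mu[:, t + 1] += mu[s, t] * P_a[s, :, a]
    return np.sum(mu, axis=1)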
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    start_coordinates = (pixel_locations[0]['location-lat'][0],
                         pixel_locations[0]['location-long'][0])
    end_coordinates = (
        pixel_locations[0]['location-lat'][len(pixel_locations[0].index) - 1],
        pixel_locations[0]['location-long'][len(pixel_locations[0].index) - 1])

    rmap_gt = np.zeros([W, H])
    rmap_gt[int(start_coordinates[0]), int(start_coordinates[1])] = R_MAX
    rmap_gt[int(end_coordinates[0]), int(end_coordinates[1])] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {},
                             1 - ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use environment feature maps instead of the identity matrix
    # feat_map = np.eye(N_STATES)
    coast_map = np.load('Feature Maps/small_maps/coast.npy')
    coast_map = np.reshape(coast_map, (600, 1))
    forest_map = np.load('Feature Maps/small_maps/forest.npy')
    forest_map = np.reshape(forest_map, (600, 1))  # was reshaping coast_map
    land_map = np.load('Feature Maps/small_maps/land.npy')
    land_map = np.reshape(land_map, (600, 1))      # was reshaping coast_map
    feat_map = np.hstack((coast_map, forest_map, land_map))

    # populate trajectories
    trajs = []
    terminal_state = end_coordinates
    for x in range(len(pixel_locations)):
        trajs.append([])
        for i in range(len(pixel_locations[x]) - 1):
            loc = pixel_locations[x].iloc[i]
            next_loc = pixel_locations[x].iloc[i + 1]
            action = get_action(loc, next_loc)
            reward = rmap_gt[int(next_loc[0]), int(next_loc[1])]
            is_done = np.array_equal(next_loc, terminal_state)
            trajs[x].append(Step(cur_state=int(gw.pos2idx(loc)), action=action,
                                 next_state=int(gw.pos2idx(next_loc)),
                                 reward=reward, done=is_done))

    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=100, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs,
                                LEARNING_RATE, N_ITERS)
    # print('Deep Max Ent IRL training ..')
    # rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, 10)

    # plots
    fig = plt.figure()
    plt.subplot(1, 2, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth', block=False)
    fig.savefig('GroundTruth.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP', block=False)
    fig.savefig('LP.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent', block=False)
    fig.savefig('MaxEnt.png')
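# A minimal sketch of the get_action helper used above: map the displacement
# between two consecutive grid cells to a discrete action id. The action
# numbering here (0 right, 1 left, 2 down, 3 up, 4 stay) is an assumption for
# illustration, not the repo's actual encoding.
def get_action(loc, next_loc):
    dx = int(next_loc[0]) - int(loc[0])
    dy = int(next_loc[1]) - int(loc[1])
    if (dx, dy) == (0, 1):
        return 0  # right
    if (dx, dy) == (0, -1):
        return 1  # left
    if (dx, dy) == (1, 0):
        return 2  # down
    if (dx, dy) == (-1, 0):
        return 3  # up
    return 4      # stay / non-adjacent move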
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')

    # transition probabilities of the world: for each of the 5 actions, the
    # probability of moving from state s1 to s2 given that action.
    # Getting these transition probabilities in my case is just impossible ...
    P_a = gw.get_transition_mat()

    # value iteration and policy according to the current rewards
    values_gt, policy_gt = value_iteration.value_iteration(
        P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use identity matrix as feature (one-hot encoding per state)
    feat_map = np.eye(N_STATES)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)

    np.random.seed(1)
    # sample the expert trajectories
    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ, rand_start=RAND_START)

    # maxent_irl needs the feature map and the transition probabilities
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    pdb.set_trace()  # debug breakpoint

    values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA,
                                                error=0.01, deterministic=True)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth', block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered', block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered', block=False)
    plt.show()
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 2, W - 2] = R_MAX
    rmap_gt[1, 1] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # reuse the normalized ground-truth values as a shaped reward map
    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {},
                             1 - ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ, rand_start=RAND_START)

    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs,
                                LEARNING_RATE * 2, N_ITERS * 2)
    print('Deep Max Ent IRL training ..')
    rewards_deep = deep_maxent_irl(feat_map, P_a, GAMMA, trajs,
                                   LEARNING_RATE, N_ITERS)
    print('Deep Siamese Max Ent IRL training ..')
    rewards = deep_siamese_maxent_irl(feat_map, P_a, GAMMA, trajs,
                                      LEARNING_RATE, N_ITERS)

    # plots
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 5, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 5, 2)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP', block=False)
    plt.subplot(1, 5, 3)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent', block=False)
    plt.subplot(1, 5, 4)
    img_utils.heatmap2d(np.reshape(rewards_deep, (H, W), order='F'),
                        'Reward Map - Deep Maxent', block=False)
    plt.subplot(1, 5, 5)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Deep Siamese Maxent', block=False)
    plt.show()
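# A minimal sketch of the normalize helper used above: min-max scale values to
# [0, 1] so they can be reused as a reward map. This scaling is an assumption;
# the in-repo helper may normalize differently.
def normalize(vals):
    vals = np.asarray(vals, dtype=float)
    return (vals - vals.min()) / (vals.max() - vals.min())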
def main(): N_STATES = H * W N_ACTIONS = 5 """while True: print "BAD_STATE入力" bad = raw_input('>> ') if bad == 'ok': break Bad_states.append(bad) """ #print Bad_states # init the gridworld # rmap_gt is the ground truth for rewards rmap_gt = np.zeros([H, W]) rmap_gt[H - 1, W - 1] = R_MAX # rmap_gt[H-1, 0] = R_MAX gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND) rewards_gt = np.reshape(rmap_gt, H * W, order='F') P_a = gw.get_transition_mat() values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) # use identity matrix as feature feat_map = np.eye(N_STATES) # other two features. due to the linear nature, # the following two features might not work as well as the identity. # feat_map = feature_basis(gw) # feat_map = feature_coord(gw) np.random.seed(1) trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS) #new_rewards = reward_decrease(rewards, R_GAMMA, BAD_X, BAD_Y) np.savetxt('results/rewards.txt', rewards) #print rewards values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True) #print policy # plots plt.figure(figsize=(20, 20)) img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map', block=False) plt.plot() plt.figure(figsize=(20, 20)) img_utils.heatmap2d(np.reshape(values, (H, W), order='F'), 'Policy Map', block=False) plt.plot() plt.show()
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[0, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    if ACT_RAND == 0:
        P_a = gw.get_transition_mat_deterministic()
    else:
        P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    # feat_map = np.eye(N_STATES)
    # feat_map = np.zeros(N_STATES).reshape((H, W))
    feat_map = np.random.rand(N_STATES).reshape((H, W))
    # feat_map = np.arange(N_STATES).reshape((H, W))
    if ARGS.conv:
        # feat_map[H-1, W-1] = -5
        # feat_map[0, W-1] = -5
        # feat_map[H-1, 0] = -5
        pass
    else:
        feat_map = feat_map.reshape(N_STATES)
    # feat_map = rmap_gt

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ, rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    t = time.time()
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS, ARGS.conv, ARGS.sparse)
    print('time for dirl', time.time() - t)

    values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA,
                                                     error=0.01,
                                                     deterministic=True)
    print('evd', value_iteration.expected_value_diff(
        P_a, rewards_gt, GAMMA, start_state_probs(trajs, N_STATES),
        values_gt, policy))

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth', block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered', block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered', block=False)
    plt.show()
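# A minimal sketch of start_state_probs, as used in the expected-value-diff
# call above: the empirical distribution over the first state of each
# demonstration. Assumes Step tuples with a cur_state field, as elsewhere in
# this file; the actual helper may differ.
def start_state_probs(trajs, n_states):
    probs = np.zeros(n_states)
    for traj in trajs:
        probs[traj[0].cur_state] += 1.0
    return probs / len(trajs)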
def main(): """ Recover gridworld reward using linear programming IRL """ H = 10 W = 10 N_STATES = H * W N_ACTIONS = 5 # init the gridworld including the reward grid = [ ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['-1', '-1', '-1', '-1', '-1', '0', '0', '-1', '-1', '-1'], ## ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '-1', '0', '0'], ['0', '0', '0', '0', '0', '0', '-1', '0', '0', '0'], ['0', '0', '0', '0', '0', '-1', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', str(R_MAX)] ] # custom for i, row in enumerate(grid): for j, e in enumerate(row): if e is '0': grid[i][j] = '-1' elif e is '-1': grid[i][j] = '-10' # grid, terminal state, trans_prob gw = gridworld.GridWorld(grid, {(H - 1, W - 1)}, 1 - ACT_RAND) # solve the MDP using value iteration vi = value_iteration.ValueIterationAgent(gw, GAMMA, 100) r_mat_gt = gw.get_reward_mat() v_mat_gt = gw.get_values_mat(vi.get_values()) # Construct transition matrix P_a = np.zeros((N_STATES, N_STATES, N_ACTIONS)) for si in range(N_STATES): statei = gw.idx2pos(si) for a in range(N_ACTIONS): probs = gw.get_transition_states_and_probs(statei, a) for statej, prob in probs: sj = gw.pos2idx(statej) # Prob of si to sj given action a P_a[si, sj, a] = prob # display policy and value in gridworld just for debug use gw.display_policy_grid(vi.get_optimal_policy()) gw.display_value_grid(vi.values) # display a path following optimal policy ## print 'show optimal path. any key to continue' path_gt = gw.display_path_grid(vi.get_optimal_policy()) ## img_utils.heatmap2d(np.reshape(path, (H, W), order='F'), 'Path') ## sys.exit() # setup policy policy = np.zeros(N_STATES) for i in range(N_STATES): policy[i] = vi.get_action(gw.idx2pos(i)) #------------------ After getting optimal policy through iterations ------------------ # solve for the rewards rewards = lp_irl(P_a, policy, gamma=GAMMA, l1=L1, R_max=R_MAX) r_mat = np.reshape(rewards, (H, W), order='F') v_mat = gw.get_values_mat(vi.get_values()) path = gw.display_path_grid(vi.get_optimal_policy()) # display recoverred rewards print 'show recoverred rewards map. any key to continue' ## img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered') #img_utils.heatmap3d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered') # display a path following optimal policy print 'show optimal path. any key to continue' ## path = gw.display_path_grid(vi.get_optimal_policy()) ## img_utils.heatmap2d(np.reshape(path, (H, W), order='F'), 'Path') # plots plt.figure(figsize=(20, 4)) plt.subplot(2, 4, 1) img_utils.heatmap2d(r_mat_gt, 'Rewards Map - Ground Truth', block=False) plt.subplot(2, 4, 2) img_utils.heatmap2d(np.reshape(v_mat_gt, (H, W), order='F'), 'Value Map - Ground Truth', block=False) plt.subplot(2, 4, 3) img_utils.heatmap2d(np.reshape(r_mat, (H, W), order='F'), 'Reward Map - Recovered', block=False) plt.subplot(2, 4, 4) img_utils.heatmap2d(np.reshape(v_mat, (H, W), order='F'), 'Value Map - Recovered', block=False) plt.subplot(2, 4, 5) img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'), 'Path Map - Ground Truth', block=False) plt.subplot(2, 4, 7) img_utils.heatmap2d(np.reshape(path, (H, W), order='F'), 'Path Map - Recovered', block=False) plt.show()
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards2()
    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA,
                                                           error=0.01,
                                                           deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)

    # use identity matrix as feature
    ## feat_map = np.eye(N_STATES)
    feat_map = feature_histogram(gw)

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ, rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA,
                                                     error=0.01,
                                                     deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth', block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered', block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered', block=False)
    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth', block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered', block=False)
    plt.show()
def main():
    # named tuple to record demonstrations
    Step = namedtuple('Step', 'cur_state action next_state reward done')

    # argument parser for command line arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-wid', '--width', default=5, type=int,
                        help='width of the gridworld')
    parser.add_argument('-hei', '--height', default=5, type=int,
                        help='height of the gridworld')
    parser.add_argument('-lr', '--learning_rate', default=0.01, type=float,
                        help='learning rate')
    parser.add_argument('-l', '--l_traj', default=20, type=int,
                        help='length of expert trajectory')
    parser.add_argument('--no-rand_start', dest='rand_start',
                        action='store_false',
                        help='when sampling trajectories, fix start positions')
    parser.add_argument('--rand_start', dest='rand_start', action='store_true',
                        help='when sampling trajectories, randomly pick start positions')
    parser.add_argument('--approx', dest='approx', action='store_true',
                        help='flag to perform approximation of psa')
    parser.add_argument('-g', '--gamma', default=0.9, type=float,
                        help='discount factor')
    parser.add_argument('-n', '--n_iters', default=20, type=int,
                        help='number of iterations')
    parser.add_argument('-t', '--n_trajs', default=100, type=int,
                        help='number of expert trajectories')
    parser.add_argument('-a', '--act_random', default=0.3, type=float,
                        help='probability of acting randomly')

    # set default value for rand_start variable
    parser.set_defaults(rand_start=False)

    # parse command line arguments
    args = parser.parse_args()

    # arguments for environment and irl algorithm
    r_max = 1
    gamma = args.gamma
    width = args.width
    height = args.height
    l_traj = args.l_traj
    approx = args.approx
    n_iters = args.n_iters
    n_trajs = args.n_trajs
    act_rand = args.act_random
    rand_start = args.rand_start
    learning_rate = args.learning_rate

    # variables for number of actions and states
    n_actions = 5
    n_states = height * width

    # initialize the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([height, width])
    rmap_gt[0, width - 1] = r_max
    rmap_gt[height - 1, 0] = r_max
    rmap_gt[height - 1, width - 1] = r_max

    # create grid world instance
    gw = gridworld.GridWorld(rmap_gt, {}, 1 - act_rand)

    # get true rewards, state transition dynamics
    rewards_gt = np.reshape(rmap_gt, height * width, order='F')
    P_a_true = gw.get_transition_mat()

    trajs = generate_random(gw, n_actions, n_trajs=n_trajs, len_traj=l_traj,
                            rand_start=rand_start)

    # get approximation of state transition dynamics from random trajectories
    P_a_approx = np.zeros((n_states, n_states, n_actions))
    for traj in trajs:
        for t in range(len(traj)):
            P_a_approx[traj[t].cur_state, traj[t].next_state, traj[t].action] += 1
    for s in range(n_states):
        for a in range(n_actions):
            if np.sum(P_a_approx[s, :, a]) != 0:
                P_a_approx[s, :, a] /= np.sum(P_a_approx[s, :, a])

    if approx:
        P_a = P_a_approx
    else:
        P_a = P_a_true

    # get true value function and policy from reward map
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt,
                                                           gamma, error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(n_states)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=n_trajs,
                                    len_traj=l_traj, rand_start=rand_start)

    # perform inverse reinforcement learning to get reward function
    rewards = maxent_irl(feat_map, P_a, gamma, trajs, learning_rate, n_iters)
    values, _ = value_iteration.value_iteration(P_a, rewards, gamma,
                                                error=0.01, deterministic=True)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 2, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 2, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (height, width), order='F'),
                        'Value Map - Ground Truth', block=False)
    plt.subplot(2, 2, 3)
    img_utils.heatmap2d(np.reshape(rewards, (height, width), order='F'),
                        'Reward Map - Recovered', block=False)
    plt.subplot(2, 2, 4)
    img_utils.heatmap2d(np.reshape(values, (height, width), order='F'),
                        'Value Map - Recovered', block=False)
    plt.show()

    # plots for state transition dynamics
    plt.figure(figsize=(10, 4))
    plt.subplot(2, 1, 1)
    img_utils.heatmap2d(np.reshape(P_a_true[10, :, 2], (height, width), order='F'),
                        'True Dist', block=False)
    plt.subplot(2, 1, 2)
    img_utils.heatmap2d(np.reshape(P_a_approx[10, :, 2], (height, width), order='F'),
                        'Approx Dist', block=False)
    plt.show()
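# A minimal sketch of generate_random, which samples trajectories under a
# uniform random policy so the transition dynamics can be estimated from data.
# Assumes the same GridWorld API and Step namedtuple as the rest of this file;
# height/width are passed explicitly here for illustration.
def generate_random(gw, n_actions, n_trajs=100, len_traj=20, rand_start=False,
                    height=5, width=5):
    trajs = []
    for _ in range(n_trajs):
        start = ((np.random.randint(0, height), np.random.randint(0, width))
                 if rand_start else (0, 0))
        gw.reset(start)
        cur_state = start
        episode = []
        for _ in range(len_traj):
            action = np.random.randint(0, n_actions)  # uniform random action
            next_state, reward, done = gw.step(action)
            episode.append(Step(cur_state=gw.pos2idx(cur_state), action=action,
                                next_state=gw.pos2idx(next_state),
                                reward=reward, done=done))
            cur_state = next_state
        trajs.append(episode)
    return trajs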