Example #1
 def __init__(self,
              gamma=0.9,
              act_rand=0.3,
              r_max=1,
              h=10,
              w=10,
              n_trajs=100,
              l_traj=20,
              rand_start=True,
              learning_rate=0.02,
              n_iters=20,
              save_dir="./exps",
              exp_name="gw_" + str(int(time.time())),
              n_exp=20,
              feat_map=None,
              gpu_fraction=0.2,
              terminal=True):
     self._gamma, self._act_rand, self._r_max, self._h, self._w, self._n_trajs, self._l_traj, self._rand_start, \
     self._learning_rate, self._n_iters, self._save_dir, self._exp_name, self._n_exp = \
       gamma, act_rand, r_max, h, w, n_trajs, l_traj, rand_start, learning_rate, n_iters, save_dir, exp_name, n_exp
     self._exp_result_path = save_dir + "/" + exp_name
     if not os.path.exists(self._exp_result_path):
         os.makedirs(self._exp_result_path)
     else:
          logging.warning(self._exp_result_path + " already exists")
         exit()
     rmap_gt = np.zeros([h, w])
     rmap_gt[h - 1, w - 1] = rmap_gt[0, w - 1] = rmap_gt[h - 1, 0] = r_max
     if terminal:
          self._gw = gridworld.GridWorld(rmap_gt,
                                         {(h - 1, w - 1), (0, w - 1),
                                          (h - 1, 0)}, 1 - act_rand)
      else:
          self._gw = gridworld.GridWorld(rmap_gt, {}, 1 - act_rand)
      self._rewards_gt = np.reshape(rmap_gt, h * w, order='F')
      self._P_a = self._gw.get_transition_mat()
      ts = time.time()
      self._values_gt, self._policy_gt = value_iteration.value_iteration(
          self._P_a, self._rewards_gt, gamma, error=0.01, deterministic=True)
     te = time.time()
     print "value iteration time of ground truth: ", te - ts
     ts = time.time()
     self.save_plt("gt", (3 * w, h), self._rewards_gt, self._values_gt,
                   self._policy_gt)
     te = time.time()
     print "saving plt time: ", te - ts
     self._demo_trajs = self.generate_demonstrations()
     self._feat_map = np.eye(h * w) if feat_map is None else feat_map
     self._gpu_fraction = gpu_fraction
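The constructor ends by generating demonstrations, and the later examples call a module-level `generate_demonstrations(gw, policy_gt, ...)` that is never shown on this page. Below is a minimal sketch of what such a rollout helper could look like, built only from the GridWorld accessors that do appear here (`idx2pos`, `pos2idx`, `get_transition_states_and_probs`, `get_reward_mat`) and the `Step` namedtuple from the final example; the actual helper in the repo may differ.

from collections import namedtuple

import numpy as np

# Step mirrors the namedtuple defined in the final example on this page.
Step = namedtuple('Step', 'cur_state action next_state reward done')


def generate_demonstrations(gw, policy, n_trajs=100, len_traj=20, rand_start=True):
    """Roll out a deterministic policy in the gridworld to collect demonstrations (sketch)."""
    r_mat = gw.get_reward_mat()
    n_states = len(policy)
    trajs = []
    for _ in range(n_trajs):
        s = np.random.randint(n_states) if rand_start else 0
        episode = []
        for _ in range(len_traj):
            a = int(policy[s])
            # sample the successor state from the (possibly stochastic) transition model
            outcomes = gw.get_transition_states_and_probs(gw.idx2pos(s), a)
            positions = [pos for pos, _ in outcomes]
            probs = np.array([p for _, p in outcomes])
            next_pos = positions[np.random.choice(len(positions), p=probs)]
            s_next = gw.pos2idx(next_pos)
            episode.append(Step(cur_state=s, action=a, next_state=s_next,
                                reward=r_mat[next_pos], done=False))
            s = s_next
        trajs.append(episode)
    return trajs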
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    print(feat_map.shape)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
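`maxent_irl` is imported from the repo and not reproduced on this page. As an illustration of the update it is expected to perform, here is a compact, simplified sketch: the reward weights move in the direction of the expert feature expectations minus the expected state-visitation counts under the current reward, with the forward pass done using the greedy policy from `value_iteration` (the standard algorithm uses a stochastic soft-optimal policy instead). Treat this as a sketch of the technique, not the repo's actual implementation.

import numpy as np

import value_iteration  # the repo module used throughout these examples


def maxent_irl_sketch(feat_map, P_a, gamma, trajs, lr, n_iters):
    """Simplified MaxEnt IRL update (sketch): match expert feature expectations."""
    n_states, _, n_actions = P_a.shape
    theta = np.random.uniform(size=feat_map.shape[1])

    # expert feature expectations, averaged over demonstrations
    mu_expert = np.zeros(feat_map.shape[1])
    for traj in trajs:
        for step in traj:
            mu_expert += feat_map[step.cur_state]
    mu_expert /= len(trajs)

    for _ in range(n_iters):
        rewards = feat_map.dot(theta)
        _, policy = value_iteration.value_iteration(P_a, rewards, gamma,
                                                    error=0.01, deterministic=True)
        # expected state-visitation frequencies, propagated over the demo horizon
        horizon = len(trajs[0])
        mu = np.zeros((n_states, horizon))
        for traj in trajs:
            mu[traj[0].cur_state, 0] += 1.0
        mu[:, 0] /= len(trajs)
        for t in range(1, horizon):
            for s in range(n_states):
                mu[s, t] = sum(mu[pre, t - 1] * P_a[pre, s, int(policy[pre])]
                               for pre in range(n_states))
        svf = mu.sum(axis=1)

        # gradient of the MaxEnt objective w.r.t. the reward weights
        grad = mu_expert - feat_map.T.dot(svf)
        theta += lr * grad

    return feat_map.dot(theta)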
Example #3
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[0, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    feat_map_torch = torch.tensor(feat_map, dtype=torch.float)

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map_torch, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS)
    #rewards = rewards.detach().numpy()
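    # If this deep_maxent_irl variant returns a torch tensor (as the commented line
    # above suggests), it must be converted back to a NumPy array here before
    # value_iteration / np.reshape below can use it. This is an assumption about the
    # return type; keep it commented otherwise:
    # rewards = rewards.detach().cpu().numpy()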
    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
def main():
    for seed in range(1):
        N_STATES = H * W
        # init the gridworld
        # rmap_gt is the ground truth for rewards
        rmap_gt = np.zeros([H, W])
        #goal coordinates
        rmap_gt[H - 1, W - 1] = R_MAX
        gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
        rewards_gt = np.reshape(rmap_gt, H * W, order='F')
        P_a = gw.get_transition_mat()
        values_gt, policy_gt = value_iteration.value_iteration(
            P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

        # use identity matrix as feature
        feat_map = np.eye(N_STATES)

        # other two features. due to the linear nature,
        # the following two features might not work as well as the identity.
        # feat_map = feature_basis(gw)
        # feat_map = feature_coord(gw)
        np.random.seed(0)
        #trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

        trajs = mod.exp1_case2()
        rewards = maxent_irl(gw, feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                             N_ITERS)

        #np.savetxt('results/rewards.txt', rewards)

        #values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)
        # plots
        plt.figure(figsize=(20, 20))
        img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                            'Reward Map',
                            block=False)
        plt.plot()
        #now = datetime.datetime.now()
        #figname = "results/rewards_{0:%m%d%H%M}".format(now) + ".png"
        figname = "results/rewards_seed{0}".format(seed) + ".png"
        plt.savefig(figname)
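Note that `plt.savefig` raises an error if the `results/` directory used above does not exist yet; creating it up front (before the loop) avoids that. A small guard, using only the standard library and the relative path taken from the snippet:

import os

# create the output directory used by plt.savefig above, if it is missing
os.makedirs("results", exist_ok=True)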
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards2()

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)

    # use identity matrix as feature
    ## feat_map = np.eye(N_STATES)
    feat_map = feature_histogram(gw)

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS)

    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)

    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered',
                        block=False)

    plt.show()
Example #6
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 2, W - 2] = R_MAX
    rmap_gt[1, 1] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {},
                             1 - ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE * 2,
                                N_ITERS * 2)
    print('Deep Max Ent IRL training ..')
    rewards_deep = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                                   N_ITERS)
    print('Deep Siamese Max Ent IRL training ..')
    rewards = deep_siamese_maxent_irl(feat_map, P_a, GAMMA, trajs,
                                      LEARNING_RATE, N_ITERS)

    # plots
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 5, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 5, 2)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP',
                        block=False)
    plt.subplot(1, 5, 3)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent',
                        block=False)
    plt.subplot(1, 5, 4)
    img_utils.heatmap2d(np.reshape(rewards_deep, (H, W), order='F'),
                        'Reward Map - Deep Maxent',
                        block=False)
    plt.subplot(1, 5, 5)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Deep Policy Maxent',
                        block=False)
    plt.show()
Example #7

if __name__ == "__main__":
    height = 5
    width = 5
    N_s = height * width
    N_a = 4  # left right up and down
    R_max = 10
    trans_prob = 0.7  # with 30% prob takes random action other than chosen
    gamma = 0.5
    lmbda = 10
    iterations = 100
    grid = np.zeros((height, width))
    grid[height - 1][width - 1] = R_max

    gw_mdp = gridworld.GridWorld(grid, {(height - 1, width - 1)}, trans_prob)

    R_mat = gw_mdp.get_reward_mat()
    # show rewards map
    show_heatmap(R_mat, 'Ground Truth of Reward')

    P = np.zeros((N_s, N_s, N_a))  # transition matrix

    for s_i in range(N_s):
        state_i = gw_mdp.idx2pos(s_i)
        for a in range(N_a):
            probabilities = gw_mdp.get_transition_states_and_probs(state_i, a)
            for state_j, prob in probabilities:
                s_j = gw_mdp.pos2idx(state_j)
                P[s_i, s_j, a] = prob
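With `P` assembled, each `(state, action)` slice should be a probability distribution over successor states; a quick sanity check, continuing with the `P` built above (the `lp_irl`-style solvers in the later examples expect exactly this normalization):

    # rows over the successor-state axis should sum to 1 wherever transitions are defined
    row_sums = P.sum(axis=1)  # shape (N_s, N_a)
    print("all defined transition rows normalized:",
          np.allclose(row_sums[row_sums > 0.0], 1.0))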
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards()
    gw = gridworld.GridWorld(rmap_gt, {(H - 1, W - 1)}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)

    rmap_gt = gw.get_reward_mat()

    #temp
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 3, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 3, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 3, 3)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.show()
    sys.exit()

    # feat_map = np.eye(N_STATES)
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    feat_map = feature_histogram(gw)

    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered',
                        block=False)
    plt.show()
Example #9
def main():
    """
  Recover gridworld reward using linear programming IRL
  """

    H = 10
    W = 10
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    grid = [['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0',
             str(R_MAX)]]

    gw = gridworld.GridWorld(grid, {(H - 1, W - 1)}, 1 - ACT_RAND)

    # solve the MDP using value iteration
    vi = value_iteration.ValueIterationAgent(gw, GAMMA, 100)

    r_mat = gw.get_reward_mat()
    print('show rewards map. any key to continue')
    img_utils.heatmap2d(r_mat, 'Reward Map - Ground Truth')

    v_mat = gw.get_values_mat(vi.get_values())
    print('show values map. any key to continue')
    img_utils.heatmap2d(v_mat, 'Value Map - Ground Truth')

    # Construct transition matrix
    P_a = np.zeros((N_STATES, N_STATES, N_ACTIONS))

    for si in range(N_STATES):
        statei = gw.idx2pos(si)
        for a in range(N_ACTIONS):
            probs = gw.get_transition_states_and_probs(statei, a)
            for statej, prob in probs:
                sj = gw.pos2idx(statej)
                # Prob of si to sj given action a
                P_a[si, sj, a] = prob

    # display policy and value in gridworld just for debug use
    gw.display_policy_grid(vi.get_optimal_policy())
    gw.display_value_grid(vi.values)

    # setup policy
    policy = np.zeros(N_STATES)
    for i in range(N_STATES):
        policy[i] = vi.get_action(gw.idx2pos(i))

    # solve for the rewards
    rewards = lp_irl(P_a, policy, gamma=GAMMA, l1=L1, R_max=R_MAX)

    # display recovered rewards
    print('show recovered rewards map. any key to continue')
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered')
    img_utils.heatmap3d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered')
Example #10
def main():
    N_STATES = H * W
    N_ACTIONS = 5
    start_coordinates = (pixel_locations[0]['location-lat'][0],
                         pixel_locations[0]['location-long'][0])
    end_coordinates = (
        pixel_locations[0]['location-lat'][len(pixel_locations[0].index) - 1],
        pixel_locations[0]['location-long'][len(pixel_locations[0].index) - 1])

    rmap_gt = np.zeros([W, H])
    rmap_gt[int(start_coordinates[0]), int(start_coordinates[1])] = R_MAX
    rmap_gt[int(end_coordinates[0]), int(end_coordinates[1])] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {},
                             1 - ACT_RAND)
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    # feat_map = np.eye(N_STATES)

    coast_map = np.load('Feature Maps/small_maps/coast.npy')
    coast_map = np.reshape(coast_map, (600, 1))

    forest_map = np.load('Feature Maps/small_maps/forest.npy')
    forest_map = np.reshape(forest_map, (600, 1))

    land_map = np.load('Feature Maps/small_maps/land.npy')
    land_map = np.reshape(land_map, (600, 1))

    feat_map = np.hstack((coast_map, forest_map, land_map))

    # populate trajectories
    trajs = []
    terminal_state = end_coordinates
    for x in range(len(pixel_locations)):
        trajs.append([])
        for i in range(len(pixel_locations[x]) - 1):
            loc = pixel_locations[x].iloc[i]
            next_loc = pixel_locations[x].iloc[i + 1]
            action = get_action(loc, next_loc)
            reward = rmap_gt[int(next_loc[0]), int(next_loc[1])]
            is_done = np.array_equal(next_loc, terminal_state)

            trajs[x].append(
                Step(cur_state=int(gw.pos2idx(loc)),
                     action=action,
                     next_state=int(gw.pos2idx(next_loc)),
                     reward=reward,
                     done=is_done))

    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=100, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                                N_ITERS)
    #   print 'Deep Max Ent IRL training ..'
    #   rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, 10)

    # plots
    fig = plt.figure()
    plt.subplot(1, 2, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    fig.savefig('GroundTruth.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP',
                        block=False)
    fig.savefig('LP.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent',
                        block=False)
    fig.savefig('MaxEnt.png')
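The trajectory loop in this example depends on a `get_action(loc, next_loc)` helper that is not shown on this page. A plausible sketch, assuming positions behave like (row, col) pairs and a five-action encoding; the actual index order used by `gridworld.GridWorld` is an assumption and may need adjusting:

def get_action(loc, next_loc):
    """Map two consecutive grid positions to a discrete action index (sketch).

    The ordering right/left/down/up/stay below is an assumption; align it with the
    action encoding of gridworld.GridWorld before using it.
    """
    d_row = int(next_loc[0]) - int(loc[0])
    d_col = int(next_loc[1]) - int(loc[1])
    moves = {(0, 1): 0, (0, -1): 1, (1, 0): 2, (-1, 0): 3, (0, 0): 4}
    # fall back to 'stay' for jumps larger than one cell
    return moves.get((d_row, d_col), 4)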
Example #11
def main():
    N_STATES = H * W
    N_ACTIONS = 5
    """while True:
      print "BAD_STATE入力"
      bad = raw_input('>> ')
      if bad == 'ok':
          break
      Bad_states.append(bad)
  """

    #print Bad_states
    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    #new_rewards = reward_decrease(rewards, R_GAMMA, BAD_X, BAD_Y)

    np.savetxt('results/rewards.txt', rewards)

    #print rewards

    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)

    #print policy
    # plots
    plt.figure(figsize=(20, 20))
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map',
                        block=False)
    plt.plot()

    plt.figure(figsize=(20, 20))
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map',
                        block=False)
    plt.plot()
    plt.show()
Example #12
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[0, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')

    if ACT_RAND == 0:
        P_a = gw.get_transition_mat_deterministic()
    else:
        P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    #feat_map = np.eye(N_STATES)
    # feat_map = np.zeros(N_STATES).reshape((H, W))
    feat_map = np.random.rand(N_STATES).reshape((H, W))
    #feat_map = np.arange(N_STATES).reshape((H, W))
    if ARGS.conv:
        #feat_map[H-1, W-1] = -5
        #feat_map[0, W-1] = -5
        #feat_map[H-1, 0] = -5
        pass
    else:
        feat_map = feat_map.reshape(N_STATES)
    #feat_map = rmap_gt

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    t = time.time()
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS, ARGS.conv, ARGS.sparse)
    print('time for dirl', time.time() - t)

    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)

    print(
        'evd',
        value_iteration.expected_value_diff(P_a, rewards_gt, GAMMA,
                                            start_state_probs(trajs, N_STATES),
                                            values_gt, policy))

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
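`start_state_probs(trajs, N_STATES)`, passed to `value_iteration.expected_value_diff` above, is not shown; presumably it returns the empirical distribution over trajectory start states. A minimal sketch under that assumption:

import numpy as np


def start_state_probs(trajs, n_states):
    """Empirical distribution over the first state of each demonstration (assumed helper)."""
    p0 = np.zeros(n_states)
    for traj in trajs:
        p0[traj[0].cur_state] += 1.0
    return p0 / len(trajs)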
Example #13
def main():
    """
  Recover gridworld reward using linear programming IRL
  """

    H = 10
    W = 10
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld including the reward
    grid = [
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['-1', '-1', '-1', '-1', '-1', '0', '0', '-1', '-1', '-1'],
        ## ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '-1', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '-1', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '-1', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0',
         str(R_MAX)]
    ]

    # custom
    for i, row in enumerate(grid):
        for j, e in enumerate(row):
            if e == '0':
                grid[i][j] = '-1'
            elif e == '-1':
                grid[i][j] = '-10'

    # grid, terminal state, trans_prob
    gw = gridworld.GridWorld(grid, {(H - 1, W - 1)}, 1 - ACT_RAND)

    # solve the MDP using value iteration
    vi = value_iteration.ValueIterationAgent(gw, GAMMA, 100)
    r_mat_gt = gw.get_reward_mat()
    v_mat_gt = gw.get_values_mat(vi.get_values())

    # Construct transition matrix
    P_a = np.zeros((N_STATES, N_STATES, N_ACTIONS))

    for si in range(N_STATES):
        statei = gw.idx2pos(si)
        for a in range(N_ACTIONS):
            probs = gw.get_transition_states_and_probs(statei, a)
            for statej, prob in probs:
                sj = gw.pos2idx(statej)
                # Prob of si to sj given action a
                P_a[si, sj, a] = prob

    # display policy and value in gridworld just for debug use
    gw.display_policy_grid(vi.get_optimal_policy())
    gw.display_value_grid(vi.values)

    # display a path following optimal policy
    ## print 'show optimal path. any key to continue'
    path_gt = gw.display_path_grid(vi.get_optimal_policy())
    ## img_utils.heatmap2d(np.reshape(path, (H, W), order='F'), 'Path')
    ## sys.exit()

    # setup policy
    policy = np.zeros(N_STATES)
    for i in range(N_STATES):
        policy[i] = vi.get_action(gw.idx2pos(i))

    #------------------ After getting optimal policy through iterations ------------------
    # solve for the rewards
    rewards = lp_irl(P_a, policy, gamma=GAMMA, l1=L1, R_max=R_MAX)
    r_mat = np.reshape(rewards, (H, W), order='F')
    v_mat = gw.get_values_mat(vi.get_values())
    path = gw.display_path_grid(vi.get_optimal_policy())

    # display recovered rewards
    print('show recovered rewards map. any key to continue')
    ## img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered')
    #img_utils.heatmap3d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered')

    # display a path following optimal policy
    print('show optimal path. any key to continue')
    ## path = gw.display_path_grid(vi.get_optimal_policy())
    ## img_utils.heatmap2d(np.reshape(path, (H, W), order='F'), 'Path')

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(r_mat_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(v_mat_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(r_mat, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(v_mat, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)

    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered',
                        block=False)

    plt.show()
Example #14
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')

    # transition probabilities: for each of the 5 actions, the probability of
    # moving from state s1 to s2 given that action
    P_a = gw.get_transition_mat()
    # getting the transition probabilities in my own case is just not feasible ...

    # value iteration and policy according to the current rewards
    values_gt, policy_gt = value_iteration.value_iteration(
        P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use identity matrix as feature (one-hot encoding of the states)
    feat_map = np.eye(N_STATES)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    # expert trajectories
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    # inputs: the feature map and the transition probabilities of the world
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    pdb.set_trace()

    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
Example #15
def main():

	# named tuple to record demonstrations
	Step = namedtuple('Step','cur_state action next_state reward done')

	# argument parser for command line arguments
	parser = argparse.ArgumentParser(description=None)

	parser.add_argument('-wid', '--width', default=5, type=int, 
						help='width of the gridworld')
	parser.add_argument('-hei', '--height', default=5, type=int, 
						help='height of the gridworld')
	parser.add_argument('-lr', '--learning_rate', default=0.01, type=float, 
						help='learning rate')
	parser.add_argument('-l', '--l_traj', default=20, type=int, 
						help='length of expert trajectory')

	parser.add_argument('--no-rand_start', dest='rand_start', action='store_false', 
						help='when sampling trajectories, fix start positions')
	parser.add_argument('--rand_start', dest='rand_start', action='store_true', 
						help='when sampling trajectories, randomly pick start positions')
	parser.add_argument('--approx', dest='approx', action='store_true', 
						help='flag to perform approximation of psa')

	parser.add_argument('-g', '--gamma', default=0.9, type=float, 
						help='discount factor')
	parser.add_argument('-n', '--n_iters', default=20, type=int, 
						help='number of iterations')
	parser.add_argument('-t', '--n_trajs', default=100, type=int, 
						help='number of expert trajectories')
	parser.add_argument('-a', '--act_random', default=0.3, type=float, 
						help='probability of acting randomly')
	
	# set default value for rand_start variable
	parser.set_defaults(rand_start=False)

	# parse and print arguments
	args = parser.parse_args()

	# arguments for environment and irl algorithm
	r_max = 1 
	gamma = args.gamma
	width = args.width
	height = args.height
	l_traj = args.l_traj
	approx = args.approx
	n_iters = args.n_iters
	n_trajs = args.n_trajs
	act_rand = args.act_random
	rand_start = args.rand_start
	learning_rate = args.learning_rate

	# variables for number of actions and states
	n_actions = 5
	n_states = height * width

	# initialize the gridworld
	# rmap_gt is the ground truth for rewards
	rmap_gt = np.zeros([height, width])

	rmap_gt[0, width-1] = r_max
	rmap_gt[height-1, 0] = r_max
	rmap_gt[height-1, width-1] = r_max

	# create grid world instance
	gw = gridworld.GridWorld(rmap_gt, {}, 1-act_rand)

	# get true rewards, state transition dynamics
	rewards_gt = np.reshape(rmap_gt, height*width, order='F')
	P_a_true = gw.get_transition_mat()

	trajs = generate_random(gw, n_actions, n_trajs=n_trajs, len_traj=l_traj, rand_start=rand_start)

	# get approximation of state transition dynamics
	P_a_approx = np.zeros((n_states, n_states, n_actions))
	for traj in trajs:
		for t in range(len(traj)):
			P_a_approx[traj[t].cur_state, traj[t].next_state, traj[t].action] += 1

	for s in range(n_states):
		for a in range(n_actions):
			if np.sum(P_a_approx[s,:,a]) != 0:
				P_a_approx[s,:,a] /= np.sum(P_a_approx[s,:,a])

	if approx:
		P_a = P_a_approx
	else:
		P_a = P_a_true

	# get true value function and policy from reward map
	values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, gamma, error=0.01, deterministic=True)

	# use identity matrix as feature
	feat_map = np.eye(n_states)

	# other two features. due to the linear nature, 
	# the following two features might not work as well as the identity.
	# feat_map = feature_basis(gw)
	# feat_map = feature_coord(gw)

	trajs = generate_demonstrations(gw, policy_gt, n_trajs=n_trajs, len_traj=l_traj, 
									rand_start=rand_start)

	# perform inverse reinforcement learning to get reward function
	rewards = maxent_irl(feat_map, P_a, gamma, trajs, learning_rate, n_iters)
	values, _ = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)

	# plots
	plt.figure(figsize=(20,4))
	plt.subplot(2, 2, 1)
	img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
	plt.subplot(2, 2, 2)
	img_utils.heatmap2d(np.reshape(values_gt, (height,width), order='F'), 'Value Map - Ground Truth', block=False)
	plt.subplot(2, 2, 3)
	img_utils.heatmap2d(np.reshape(rewards, (height,width), order='F'), 'Reward Map - Recovered', block=False)
	plt.subplot(2, 2, 4)
	img_utils.heatmap2d(np.reshape(values, (height,width), order='F'), 'Value Map - Recovered', block=False)
	plt.show()

	# plots for state transition dynamics
	plt.figure(figsize=(10,4))
	plt.subplot(2, 1, 1)
	img_utils.heatmap2d(np.reshape(P_a_true[10,:,2], (height,width), order='F'), 'True Dist', block=False)
	plt.subplot(2, 1, 2)
	img_utils.heatmap2d(np.reshape(P_a_approx[10,:,2], (height,width), order='F'), 'Approx Dist', block=False)
	plt.show()