Example #1
def main(method):

    if method == "linear":
        print("linear method")
        theta = maxent.irl(feature_matrix, gw.n_actions, discount,
                           gw.transition_probability, trajectories, epochs,
                           learning_rate)
    elif method == "deep":
        print("deep method")
        l1 = l2 = 0
        theta = deep_maxent.irl(
            (feature_matrix.shape[1], ) + network_structure,
            feature_matrix,
            gw.n_actions,
            discount,
            gw.transition_probability,
            trajectories,
            epochs,
            learning_rate,
            l1=l1,
            l2=l2)
    print(theta.shape)
    recovered_reward = feature_matrix.dot(theta).reshape((n_states, ))
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))

    plot.plot(ground_r, standardised_reward, grid_size)
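Example #1 relies on names defined elsewhere in its script (gw, feature_matrix, trajectories, discount, epochs, learning_rate, n_states, ground_r, and network_structure for the deep variant). A minimal sketch of that surrounding setup, modeled on Example #2 with assumed values, could look like this (it presumes the same gridworld/maxent/deep_maxent/plot imports and StandardScaler used by the other examples):

import numpy as np

# Assumed setup for Example #1; values are illustrative, modeled on Example #2.
grid_size = 5
discount = 0.01
epochs = 200
learning_rate = 0.01
n_trajectories = 20
trajectory_length = 3 * grid_size
network_structure = (3, 3)  # hidden-layer sizes for the deep variant (assumption)

gw = gridworld.Gridworld(grid_size, 0.3, discount)  # wind = 0.3, as in the other examples
n_states = gw.n_states
feature_matrix = gw.feature_matrix()
trajectories = gw.generate_trajectories(n_trajectories, trajectory_length,
                                        gw.optimal_policy)
ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

main("linear")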
Example #2
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.


    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)

    #trajectories = gw.generate_trajectories(n_trajectories,trajectory_length,gw.optimal_policy)
    trajectories = gw.my_generate_trajectories(n_trajectories,
                                               trajectory_length,
                                               gw.optimal_policy)

    feature_matrix = gw.feature_matrix()
    print(trajectories.shape)

    #feature_matrix = gw.feature_matrix_goalVsOther()
    #feature_matrix = gw.feature_matrix_goalVsOtherTwo()
    #feature_matrix = gw.feature_matrix_goalVsOtherThree()

    # ground-truth reward, specified by us since we know which states are good vs bad
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # reward recovered using the IRL algorithm
    recovered_reward = maxent.irl(feature_matrix, gw.n_actions, discount,
                                  gw.transition_probability, trajectories,
                                  epochs, learning_rate)

    # standardise the recovered reward
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))
    #print(recovered_reward)
    #print(standardised_reward)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(standardised_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
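A typical invocation of Example #2, with illustrative values (a 5x5 grid, 20 trajectories, 200 epochs); the original script's defaults may differ:

if __name__ == '__main__':
    main(5, 0.01, 20, 200, 0.01)  # grid_size, discount, n_trajectories, epochs, learning_rate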
Example #3
def execute_maxent(world, terminal, trajectories):
    """
    Maximum Entropy Inverse Reinforcement Learning
    """
    # set up features: we use one feature vector per state
    features = world.state_features()

    # choose our parameter initialization strategy:
    #   initialize parameters with constant
    init = optimizer.Constant(0.1)

    # choose our optimization strategy:
    #   we select exponentiated gradient descent with linear learning-rate decay
    optim = optimizer.ExpSga(lr=optimizer.linear_decay(lr0=0.2))

    # actually do some inverse reinforcement learning
    reward = maxent.irl(world.p_transition, features, terminal, trajectories, optim, init)

    return reward
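execute_maxent assumes an irl_maxent-style world object with a p_transition array and per-state features. A sketch of the expected inputs using plain NumPy arrays and hypothetical shapes (this does not construct a real GridWorld, and the [state_from, state_to, action] indexing is an assumption):

import numpy as np

n_states, n_actions = 25, 4  # e.g. a 5x5 grid with 4 moves

# world.p_transition: p_transition[s, s_next, a] = P(s_next | s, a)
p_transition = np.full((n_states, n_states, n_actions), 1.0 / n_states)

# world.state_features(): one feature vector per state (here, indicator features)
features = np.eye(n_states)

# terminal: indices of terminal/goal states; trajectories: expert demonstrations
terminal = [n_states - 1]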
Example #4
def maxent(world, terminal, trajectories, avoid_states=None):
    """
    Maximum Entropy Inverse Reinforcement Learning
    """
    # set up features: we use one feature vector per state
    # features = W.state_features(world)
    features = W.state_custom_features(world, avoid_states, terminal)
    # choose our parameter initialization strategy:
    #   initialize parameters with constant
    init = O.Constant(1.0)

    # choose our optimization strategy:
    #   we select exponentiated gradient descent with linear learning-rate decay
    optim = O.ExpSga(lr=O.linear_decay(lr0=0.2))

    # actually do some inverse reinforcement learning
    reward = M.irl(world.p_transition, features, terminal, trajectories, optim, init)

    return reward
Example #5
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            lambda s: policy[s])

    feature_matrix = ow.feature_matrix(discrete=False)
    print(feature_matrix)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
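An illustrative call for the objectworld example (these values are not necessarily the original defaults): a 10x10 grid, discount 0.9, 15 objects, 2 colours, 20 trajectories, 50 epochs, learning rate 0.01.

if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 50, 0.01)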
Example #6
def main(grid_size, discount, n_trajectories, epochs, learning_rate):

    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)

    #trajectories = gw.my_generate_trajectories(n_trajectories,trajectory_length,gw.optimal_policy)
    #trajectories = gw.my_generate_trajectories_some_without_goal(n_trajectories,trajectory_length,gw.optimal_policy)
    trajectories = gw.my_generate_trajectories_multiple(
        n_trajectories, trajectory_length, gw.optimal_policy)

    feature_matrix = gw.feature_matrix()
    #feature_matrix = gw.feature_matrix_goalVsOther()
    #feature_matrix = gw.feature_matrix_goalVsOtherTwo()
    #feature_matrix = gw.feature_matrix_goalVsOtherThree()

    # ground-truth reward, specified by us since we know which states are good vs bad
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # reward recovered using the IRL algorithm
    recovered_reward = maxent.irl(feature_matrix, gw.n_actions, discount,
                                  gw.transition_probability, trajectories,
                                  epochs, learning_rate)
    # standardise the recovered reward
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))

    #print(recovered_reward)
    #print(standardised_reward)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(standardised_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
Example #7
def main(grid_size, discount, n_trajectories, epochs, learning_rate,
         trajectory_length, trust, expert_type, random_start):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    trajectory_length: Length of each sampled trajectory. int.
    trust: Probability that the agent's action succeeds (wind = 1 - trust). float.
    expert_type: Expert type passed to the Gridworld.
    random_start: Whether sampled trajectories start from random states. bool.
    """

    wind = 1 - trust

    gw = gridworld.Gridworld(grid_size, wind, discount, expert_type)
    trajectories = gw.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            gw.optimal_policy,
                                            random_start=random_start)
    feature_matrix = gw.feature_matrix()
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    r = maxent.irl(feature_matrix, gw.n_actions, discount,
                   gw.transition_probability, trajectories, epochs,
                   learning_rate)

    print(r.reshape((grid_size, grid_size)))

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
Example #8
    feature = [
        feature_1, feature_2, feature_3, feature_4, feature_5, feature_6
    ]

    feature = np.concatenate(
        (feature_1, feature_2, feature_3, feature_4, feature_5, feature_6),
        axis=1)

    return feature


if __name__ == '__main__':
    obstacle_leftup = [[0, 0], [0, 26], [30, 20], [40, 0]]
    obstacle_hei_wid = [[10, 27], [12, 18], [20, 10], [15, 6]]
    obstacles = (obstacle_leftup, obstacle_hei_wid)
    grid_map = GridMap(50, 50, obstacles)

    traj = load_trajectories('data/trajectories')
    print(traj)
    features = create_features(grid_map)
    r = maxent.irl(features, 4, 0.99, grid_map.transition_mat, traj, 20, 0.05)

    with open('data/reward/grid_world_reward', 'wb') as f:
        pickle.dump(r, f)

    plt.pcolor(r.reshape((grid_map.x, grid_map.y)))
    plt.colorbar()
    plt.title("Recorvered Reward")
    plt.show()
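The pickled reward can be loaded back later for analysis; a minimal sketch (binary mode, matching the dump above):

import pickle

with open('data/reward/grid_world_reward', 'rb') as f:
    r = pickle.load(f)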
Example #9
        df_passive_u = RT_p_u.append(rp_p_u, ignore_index=True)
        df_passive_u = df_passive_u.append(m_p_u, ignore_index=True)
        df_passive_u = df_passive_u.sort_values(by='time')
        df_passive_u.reset_index(inplace=True)
        del df_passive_u['index']

        # create total dataframe
        df_total = merge_df(df_active_u, df_passive_u)

        # IRL
        trajectories, state_sequence, n_states, n_actions, feature_matrix, t_dict, c_dict = tweet_traj_next_reduced(
            df_total)  # compute trajectories and other information
        tp = compute_tp(state_sequence, n_states,
                        n_actions)  # compute transition probabilities
        r = maxent.irl(
            feature_matrix, n_actions, discount, tp, trajectories, epochs,
            learning_rate
        )  # maximum entropy IRL (comment this line if you want to use deep maximum entropy IRL --> line below)
        #         r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix,n_actions, discount, tp, trajectories, epochs,learning_rate, l1=l1, l2=l2) # deep maximum entropy IRL (comment this line if you want to use maximum entropy IRL --> line above)
        w = np.linalg.lstsq(feature_matrix, r)[0]  # compute the weight of each feature

        # save results
        row = [user_name, r, w]
        df_results.loc[count] = row
        count += 1
    else:
        print("user %s has %s actions and %s states" % (user_name, n_a, n_p))

df_results.to_csv(
    "df_results_trolls_IRL.csv", index=False
)  # save rewards to a CSV file (NB: change the file name to df_results_users_IRL.csv for generic users)
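The np.linalg.lstsq step in Example #9 projects the recovered per-state reward back onto the feature space. A small self-contained illustration of that idea, with toy sizes rather than the tweet data:

import numpy as np

rng = np.random.default_rng(0)
F = rng.random((50, 5))                # 50 states, 5 features
true_w = np.array([1.0, -0.5, 0.0, 2.0, 0.3])
r = F @ true_w                         # a reward that is exactly linear in the features

w, *_ = np.linalg.lstsq(F, r, rcond=None)
print(np.allclose(w, true_w))          # True: the feature weights are recovered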
Example #10
def train(q_learning_rate, inverse_learning_rate):
    e = env.Env(N_STEP, N_STATES, N_ACTIONS, N_FEATURES)
    demonstrations = np.loadtxt('test.csv', delimiter=',', dtype=str)
    e.get_init(demonstrations)
    q_table = np.random.uniform(size=(N_STEP, N_STATES, N_ACTIONS))
    feature_expectations = np.zeros(N_FEATURES)
    maxent.find_feature_expectations(demonstrations, feature_expectations, e)

    irl_feature_expectations = np.zeros(N_FEATURES)

    alpha = np.random.uniform(size=(N_FEATURES, ))
    e.set_alpha(alpha)

    grad = []

    for episode in range(2500000):
        state = e.reset()
        t = 0

        if episode != 0 and episode % 50000 == 0:
            # update alpha
            # print(episode, q_table)
            # print(episode, irl_feature_expectations)
            learner = irl_feature_expectations / float(episode)
            gradient = maxent.irl(feature_expectations, learner, alpha,
                                  inverse_learning_rate)
            print(gradient)
            grad.append(np.linalg.norm(gradient))
            e.set_alpha(alpha)

        series = [state]
        irl_feature_expectations += e.feature_vector(series)
        while True:
            action = choose_action(q_table[t][int(state)])

            next_state = e.step(action)
            series.append(next_state)

            reward = e.get_reward(series)
            update_q_table(t, state, action, reward, next_state,
                           q_learning_rate, q_table)
            irl_feature_expectations += e.feature_vector(series)
            t += 1
            state = next_state

            if t == 5:
                break

    print(alpha)
    print(grad)
    plt.plot(grad,
             label='q_learning_rate: ' + str(q_learning_rate) +
             ' inverse_learning_rate: ' + str(inverse_learning_rate))
    # plt.ylim(0, int(max(grad))+1)
    plt.title('q_learning_rate: ' + str(q_learning_rate) +
              ' inverse_learning_rate: ' + str(inverse_learning_rate))
    plt.savefig('train_' + str(q_learning_rate) + '_' +
                str(inverse_learning_rate) + '.png')

    episodes = []
    for demo in demonstrations:
        episode = demo[0]
        state = demo[0]
        t = 0
        while True:
            action = choose_action(q_table[t][int(state)], greedy=0)
            next_state = e.step(action)

            t += 1
            state = next_state
            episode += state

            if t == 5:
                break
        episodes.append(episode)

    with open(
            str(q_learning_rate) + '_' + str(inverse_learning_rate) +
            'out.csv', 'w') as w:
        for episode in episodes:
            w.write(episode)
            w.write('\n')
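A hypothetical run of Example #10, assuming the module-level constants (N_STEP, N_STATES, N_ACTIONS, N_FEATURES) and the test.csv demonstrations are defined elsewhere; the learning rates here are illustrative:

if __name__ == '__main__':
    train(q_learning_rate=0.1, inverse_learning_rate=0.01)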