Example #1
def training(pos):

    id_trajectory = load.load_trajectory(1000)

    graph_trajectories = tools.choose_trajectory(1000, id_trajectory)

    _graph = load.load_graph_traj(graph_trajectories)

    sample_trajectories = tools.choose_trajectory(100, id_trajectory)

    gw = gridworld.Gridworld(_graph, 0.9)

    feature_matrix = gw.feature_matrix(_graph)

    alpha = maxent.irl(_graph, feature_matrix, sample_trajectories, 1, 0.05)

    path = "D:/Ubicomp/alpha" + str(pos) + ".txt"
    print(path)
    numpy.savetxt(path, alpha)

    _graph = graph.Graph([], {}, False, False)

    del _graph

    return alpha
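A minimal sketch of how this helper might be driven; the loop bound and the reload with numpy.loadtxt below are illustrative only, not part of the original project.

if __name__ == "__main__":
    # hypothetical driver: learn one reward-weight vector per position and reload it
    for pos in range(3):
        alpha = training(pos)
        restored = numpy.loadtxt("D:/Ubicomp/alpha" + str(pos) + ".txt")
        assert restored.shape == alpha.shape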
Example #2
def train(discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    # wind = 0.3
    trajectory_length = 268

    # gw = gridworld.Gridworld(grid_size, wind, discount)
    env = Env()
    trajectories = env.generate_trajectories(n_trajectories, trajectory_length,
                                             env.optimal_policy_deterministic)
    feature_matrix = env.feature_matrix()

    r = maxent.irl(feature_matrix, env.n_actions, discount,
                   env.transition_probability, trajectories, epochs,
                   learning_rate, "alpha_%d.pkl", "alpha_205.pkl", 205)

    pkl.dump(r, open("maxent_reward.pkl", 'wb'))

    return r
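A hedged example call, assuming the modified maxent.irl signature used above; the hyperparameter values are placeholders.

if __name__ == "__main__":
    # illustrative hyperparameters; train() writes "maxent_reward.pkl" as a side effect
    reward = train(discount=0.9, n_trajectories=20, epochs=200, learning_rate=0.01)
    reloaded = pkl.load(open("maxent_reward.pkl", "rb"))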
Example #3
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 3*grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    trajectories = gw.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            gw.optimal_policy)
    feature_matrix = gw.feature_matrix()
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    r = maxent.irl(feature_matrix, gw.n_actions, discount,
        gw.transition_probability, trajectories, epochs, learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
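A typical entry point for this script might look like the following; the concrete argument values are illustrative, not taken from the snippet.

if __name__ == '__main__':
    # illustrative values: grid_size, discount, n_trajectories, epochs, learning_rate
    main(5, 0.01, 20, 200, 0.01)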
Example #4
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    trajectories = gw.generate_trajectories(n_trajectories, trajectory_length,
                                            gw.optimal_policy)
    feature_matrix = gw.feature_matrix()
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    r = maxent.irl(feature_matrix, gw.n_actions, discount,
                   gw.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
Example #5
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    print("ow.n_states", ow.n_states)
    print("ow.n_actions", ow.n_actions)
    print("ow.transition_probability", ow.transition_probability,
          len(ow.transition_probability), len(ow.transition_probability[0]),
          len(ow.transition_probability[0][0]))
    print("ground_r", ground_r, len(ground_r))
    print("ow.discount", ow.discount)

    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    print(trajectories)
    feature_matrix = ow.feature_matrix(discrete=False)
    print("feature_matrix", feature_matrix, len(feature_matrix),
          len(feature_matrix[0]))

    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
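Likewise, a hedged example call for the objectworld variant; every value below is a placeholder.

if __name__ == '__main__':
    # illustrative values: grid_size, discount, n_objects, n_colours,
    # n_trajectories, epochs, learning_rate
    main(10, 0.9, 15, 2, 20, 50, 0.01)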
Example #6
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.


    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 3*grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    
    #trajectories = gw.generate_trajectories(n_trajectories, trajectory_length, gw.optimal_policy)
    trajectories = gw.my_generate_trajectories(n_trajectories, trajectory_length,
                                               gw.optimal_policy)
    
    feature_matrix = gw.feature_matrix()
    #feature_matrix = gw.feature_matrix_goalVsOther()
    #feature_matrix = gw.feature_matrix_goalVsOtherTwo()
    #feature_matrix = gw.feature_matrix_goalVsOtherThree()
    
    # ground truth given by us, as we know which states are good vs. bad
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    
    # reward recovered using the IRL algorithm
    recovered_reward = maxent.irl(feature_matrix, gw.n_actions, discount,
        gw.transition_probability, trajectories, epochs, learning_rate)
    
    # let's standardise the recovered reward
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))
    #print(recovered_reward)
    #print(standardised_reward)    
    
    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(standardised_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
Example #7
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind, discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability, ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length, lambda s: policy[s])
    feature_matrix = ow.feature_matrix(discrete=False)
    r = maxent.irl(
        feature_matrix, ow.n_actions, discount, ow.transition_probability, trajectories, epochs, learning_rate
    )

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
Example #8
def main(discount, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    #wind = 0.3
    #trajectory_length = 8

    lw = lindaworld.Lindaworld()
    #ground_r = np.array([lw.reward(s) for s in range(lw.n_states)])
    #policy = find_policy(lw.n_states, lw.n_actions, lw.transition_probability,
    #                     ground_r, lw.discount, stochastic=False)
    #trajectories = lw.generate_trajectories(n_trajectories,
    #                                        trajectory_length,
    #                                        lambda s: policy[s])
    feature_matrix = lw.feature_matrix(discrete=False)
    reward = maxent.irl(feature_matrix, lw.n_actions, discount,
                        lw.transition_probability, lw.trajectories, epochs,
                        learning_rate)

    policy = maxent.find_policy(lw.n_states, reward, lw.n_actions, discount,
                                lw.transition_probability)

    ## Save the policy
    print "Saving policy file...",
    sys.stdout.flush()
    policy_file = open("linda_policy.pnpo", "w")
    for i, a_prob in enumerate(policy):
        state = lw.state_list[i]
        actions = np.array(lw.action_list)[np.where(a_prob == np.amax(a_prob))]
        #print set(actions.tolist())
        action = random.choice(
            list(set(actions.tolist()) - set(["NO_ACTION"])))
        policy_file.write(state + "\t" + action + "\n")
    policy_file.close()
    print "DONE"

    #for i, ap in enumerate(policy):
    #    print lw.state_list[i], np.array(lw.action_list)[np.where(ap==np.amax(ap))]

    #for i, state in enumerate(lw.state_list):
    #    print reward[i], state

    rospy.init_node("reward_visualizer")

    ## Visualize the reward on the rviz markers
    # initialize interactive marker server
    rew_markers_publisher = rospy.Publisher("reward_visualizer",
                                            MarkerArray,
                                            latch=True,
                                            queue_size=10)

    # take the current markers
    top_map = rospy.wait_for_message("/topological_map",
                                     TopologicalMap,
                                     timeout=10)

    max_reward = max(reward)
    min_reward = min(reward)
    map_v = "/map"
    marker_array = MarkerArray()
    for index, node in enumerate(top_map.nodes):

        # get the corresponding state index
        currentstate_index = None
        closeststate_index = None
        current_marker_id = None
        closest_marker_id = None
        for i, state in enumerate(lw.state_list):
            if "CurrentNode_" + node.name in state.split(" "):
                currentstate_index = i
                current_marker_id = int(node.name.replace("WayPoint", ""))
                print "current>>>> ", state
            if "ClosestNode_" + node.name in state.split(" "):
                closeststate_index = i
                closest_marker_id = int(node.name.replace("WayPoint",
                                                          "")) * 100
                print "closest>>>> ", state

        # Current state marker
        if current_marker_id is not None:
            current_box_marker = Marker()
            if currentstate_index is not None:
                # get heatmap color
                r, g, b = rgb(min_reward, max_reward,
                              reward[currentstate_index])
                current_box_marker.text = str(reward[currentstate_index])
            else:
                current_box_marker.text = "0"
                r = g = b = 0.0
            current_box_marker.header.frame_id = map_v
            current_box_marker.type = Marker.CYLINDER
            current_box_marker.action = Marker.ADD
            current_box_marker.id = current_marker_id
            current_box_marker.scale.x = 0.5
            current_box_marker.scale.y = 0.5
            current_box_marker.scale.z = 0.1
            current_box_marker.pose = node.pose
            current_box_marker.color.r = r
            current_box_marker.color.g = g
            current_box_marker.color.b = b
            current_box_marker.color.a = 1.0

            marker_array.markers.append(current_box_marker)

        if closest_marker_id is not None:
            # Closest state marker
            closest_box_marker = Marker()
            if closeststate_index is not None:
                # get heatmap color
                r, g, b = rgb(min_reward, max_reward,
                              reward[closeststate_index])
                closest_box_marker.text = str(reward[closeststate_index])
                #print reward[closeststate_index]
            else:
                closest_box_marker.text = "0"
                r = g = b = 0.0
            closest_box_marker.header.frame_id = map_v
            closest_box_marker.type = Marker.CYLINDER
            closest_box_marker.action = Marker.ADD
            closest_box_marker.id = closest_marker_id
            closest_box_marker.scale.x = 2
            closest_box_marker.scale.y = 2
            closest_box_marker.scale.z = 0.01
            closest_box_marker.pose = node.pose
            closest_box_marker.color.r = r
            closest_box_marker.color.g = g
            closest_box_marker.color.b = b
            closest_box_marker.color.a = 0.7

            marker_array.markers.append(closest_box_marker)

    #print marker_array
    rew_markers_publisher.publish(marker_array)

    rospy.spin()
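The function above relies on an rgb() heat-map helper that is not shown in the snippet. A minimal, purely illustrative sketch of such a helper (not the author's implementation) could map a reward linearly from blue to red:

def rgb(minimum, maximum, value):
    # hypothetical helper: scale value into [0, 1] and blend blue -> red
    span = float(maximum - minimum) or 1.0
    t = (value - minimum) / span
    return t, 0.0, 1.0 - t  # (r, g, b) floats in [0, 1], as Marker colours expect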
Example #9
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: Start location for the trajectory generated from the recovered policy. (x, y) tuple.
    """

    sx, sy = start_state
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)

    ow.plot_grid()

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)

    print("Policy = ", policy.shape)
    #    print ("policy - {}".format(policy))
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])

    print("trajectories = ", trajectories.shape)
    #    for t in trajectories:
    #        ow.plot_grid("trajectory_{}.png".format(t), t)
    #    for t in trajectories:
    #        for s, a, r in t:
    #            print (ow.int_to_point(s), ow.actions[a], r)
    #        print ("---------")

    feature_matrix = ow.feature_matrix(discrete=False)

    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    recovered_policy = find_policy(ow.n_states,
                                   ow.n_actions,
                                   ow.transition_probability,
                                   r,
                                   ow.discount,
                                   stochastic=False)

    new_trajectory = ow.generate_trajectories(1, trajectory_length,
                                              lambda s: recovered_policy[s],
                                              False, (sx, sy))
    print("new trajectory")
    for t in new_trajectory:
        ow.plot_grid("new_trajectory.png", t)
        for s, a, rw in t:
            print(ow.int_to_point(s), ow.actions[a], rw)
        print("---------")
    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.savefig("reward.png", format="png", dpi=150)
Example #10
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.1  # probability that noise, interference, or expert error makes the chosen action suboptimal
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    # Use reinforcement learning to find the optimal policy; it stands in for the expert policy that generates the demonstration trajectories
    policy = find_policy(gw.n_states, gw.n_actions, gw.transition_probability,
                         ground_r, discount)
    trajectories = gw.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            policy,
                                            random_start=True)
    # Plot the trajectories before preprocessing
    paths = []
    for i in trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths, 'Expert trajectories before preprocessing')
    # Preprocess the expert trajectories
    new_trajectories = pre_treated(gw.n_states, gw.n_actions, trajectories)
    # Plot the trajectories after preprocessing
    paths = []
    for i in new_trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths, 'Expert trajectories after preprocessing')

    feature_matrix = gw.feature_matrix()
    trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                    for trajectory in trajectories]  # format expected by maxent.irl
    r1, R1 = maxent.irl(feature_matrix, gw.n_actions,
                        discount, gw.transition_probability,
                        np.array(trajectories), epochs, learning_rate)
    r1 = r1 / max(r1)
    loss1 = []
    for r in R1:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss1.append(loss)

    new_trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                        for trajectory in new_trajectories]  # format expected by maxent.irl
    feature_matrix = gw.feature_matrix()
    r2, R2 = maxent.irl(feature_matrix, gw.n_actions,
                        discount, gw.transition_probability,
                        np.array(new_trajectories), epochs, learning_rate)
    r2 = r2 / max(r2)
    loss2 = []
    for r in R2:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss2.append(loss)
    # Supervised learning
    policy_sl = supervised_learning(new_trajectories, policy)  # supervised-learning policy
    equal = 0
    for i in range(len(policy)):
        if policy_sl[i] == policy[i]:
            equal += 1 / len(policy)
    print("Accuracy of the policy learned by supervised learning: {}%".format(100 * equal))
    # Generate trajectories with the supervised-learning policy
    sl_trajectories = gw.generate_trajectories(n_trajectories,
                                               trajectory_length,
                                               policy_sl,
                                               random_start=True)
    # Preprocess the supervised-learning trajectories
    new_sl_trajectories = pre_treated(gw.n_states, gw.n_actions,
                                      sl_trajectories)
    # Plot the trajectories produced by the supervised-learning policy
    paths = []
    for i in new_sl_trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths, 'Expert trajectories estimated by the supervised-learning policy')
    new_sl_trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                           for trajectory in new_sl_trajectories]
    mix_trajectories = new_trajectories
    for trajectory in new_sl_trajectories:
        for i in new_trajectories:
            if trajectory[-1] == i[-1]:
                mix_trajectories.append(trajectory)
                break
    feature_matrix = gw.feature_matrix()
    r3, R3 = maxent.irl(feature_matrix, gw.n_actions,
                        discount, gw.transition_probability,
                        np.array(mix_trajectories), epochs, learning_rate)
    r3 = r3 / max(r3)
    loss3 = []
    for r in R3:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss3.append(loss)
    # # 2D plots
    # plt.subplot(1, 3, 1)
    # plt.pcolor(r1.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("Reward recovered without preprocessing")
    # plt.subplot(1, 3, 2)
    # plt.pcolor(r2.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("Reward recovered with preprocessing")
    # plt.subplot(1, 3, 3)
    # plt.pcolor(r3.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("Reward recovered with preprocessing and supervised learning")
    # plt.show()

    # Draw 3D bar plots
    # Plot settings

    # X and Y must have the same number of elements
    X = range(gw.grid_size)
    Y = range(gw.grid_size)
    Z1 = r1
    Z2 = r2
    Z3 = r3
    # meshgrid squares the number of points: e.g. X and Y of length 4 each become length 16 after meshgrid and ravel, because there are 16 grid points
    xx, yy = np.meshgrid(X, Y)  # grid coordinates
    X, Y = xx.ravel(), yy.ravel()  # flatten the matrices
    # # bar properties
    height = np.zeros_like(Z1)  # all-zero array with the same shape as Z; the base of the bars
    width = depth = 1  # bar width and depth
    # # colour array, same length as Z
    c = ['y'] * len(Z1)

    # Draw the plot. The natural argument order is X, Y, Z, width, depth, height, but that only draws thin slabs at the bar tops, so Z and height are swapped
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')  # 3D axes
    ax.bar3d(X, Y, height, width, depth, Z1, color=c,
             shade=True)  # width, depth, height
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("Reward recovered without preprocessing")
    plt.show()

    # Same plot for the preprocessed result; Z and height are swapped for the reason above
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')  # 3D axes
    ax.bar3d(X, Y, height, width, depth, Z2, color=c,
             shade=True)  # width, depth, height
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("Reward recovered with preprocessing")
    plt.show()

    # Same plot for the preprocessed + supervised-learning result
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')  # 3D axes
    ax.bar3d(X, Y, height, width, depth, Z3, color=c,
             shade=True)  # width, depth, height
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("Reward recovered with preprocessing and supervised learning")
    plt.show()

    # Plot the error curves
    plt.plot(range(epochs), loss1, color='r', label='no preprocessing')
    plt.plot(range(epochs), loss2, color='g', label='with preprocessing')
    plt.plot(range(epochs), loss3, color='b', label='preprocessing + supervised learning')
    plt.legend(loc=1)  # legend location; 1 = upper right
    plt.xlabel('epochs')
    plt.ylabel('Error')
    plt.title('grid_size=10,discount=0.9')
    plt.plot()
    plt.show()
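The title of the error plot suggests grid_size=10 and discount=0.9; a hedged invocation along those lines (the remaining arguments are placeholders) might be:

if __name__ == '__main__':
    # grid_size and discount taken from the plot title above; the rest are illustrative
    main(10, 0.9, 20, 200, 0.01)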
Example #11
def main(grid_size,
         discount,
         n_objects,
         n_colours,
         n_trajectories,
         epochs,
         learning_rate,
         start_state,
         wind=0.0,
         algo="maxnet",
         mdp="gridworld"):
    """
    Run inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: start location to generate trajectory from
    wind: Probability of a random (non-optimal) action. float.
    algo: IRL algorithm to run (currently "maxent" and "deep_maxnet" are supported)
    mdp: MDP to use ("gridworld" or "objectworld")
    """

    sx, sy = start_state
    trajectory_length = 8

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states,
                         ow.n_actions,
                         ow.transition_probability,
                         ground_r,
                         ow.discount,
                         stochastic=False)
    optimal_v = optimal_value(ow.n_states,
                              ow.n_actions, ow.transition_probability,
                              normalize(ground_r), ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)

    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)
    #    ow.plot_grid("value_{}_t{}_e{}_w{}.png".format(algo,
    #                                n_trajectories, epochs, wind), value=optimal_v)
    ow.plot_grid("policy_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                    epochs, wind),
                 policy=policy,
                 value=optimal_v)

    r = []
    ground_svf = []
    if algo == "maxent":
        import irl.maxent as maxent
        ground_svf = maxent.find_svf(ow.n_states, trajectories)
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)
    elif algo == "deep_maxnet":
        import irl.deep_maxent as deep_maxent
        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1], ) + structure,
                            feature_matrix,
                            ow.n_actions,
                            discount,
                            ow.transition_probability,
                            trajectories,
                            epochs,
                            learning_rate,
                            l1=l1,
                            l2=l2)

    recovered_policy = find_policy(ow.n_states,
                                   ow.n_actions,
                                   ow.transition_probability,
                                   normalize(r),
                                   ow.discount,
                                   stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)

    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    #    ow.plot_grid("recovered_value_{}_t{}_e{}_w{}.png".format(algo,
    #                                n_trajectories, epochs, wind),
    #                                value=recovered_v)
    ow.plot_grid("recovered_policy_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                 policy=recovered_policy,
                 value=recovered_v)

    #    print("new trajectory")
    #    for t in new_trajectory:
    #        for s, a, rw in t:
    #            print (ow.int_to_point(s), ow.actions[a], rw)
    #        print ("---------")
    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]

    plt.subplot(111)

    plt.pcolor(x, y, ground_svf.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth SVF")
    plt.savefig("ground_svf_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                       epochs, wind),
                format="png",
                dpi=150)

    plt.pcolor(x, y, recovered_svf.reshape((grid_size, grid_size)))
    plt.title("Recovered SVF")
    plt.savefig("recovered_svf_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                format="png",
                dpi=150)

    plt.pcolor(x, y, normalize(ground_r).reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.savefig("ground_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                format="png",
                dpi=150)

    plt.pcolor(x, y, normalize(r).reshape((grid_size, grid_size)))
    plt.title("Recovered reward")
    plt.savefig("recovered_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
                format="png",
                dpi=150)
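Finally, a hedged example of calling this more configurable entry point; every value is illustrative, and only the algo values handled above ("maxent", "deep_maxnet") will actually run IRL.

if __name__ == '__main__':
    # illustrative values: grid_size, discount, n_objects, n_colours,
    # n_trajectories, epochs, learning_rate, start_state, plus keyword options
    main(5, 0.9, 15, 2, 20, 200, 0.01, (0, 0),
         wind=0.1, algo="maxent", mdp="gridworld")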