def iter(cls, model, Q):
        V = util.classes.NumMap()
        # Compute V(s) = max_{a} Q(s,a)
        for s in model.S():
            V_s = util.classes.NumMap()
            for a in model.A(s):
                V_s[a] = Q[ (s,a) ]
            if len(V_s) > 0:
                V[s] = V_s.max()
            else:
                V[s] = 0.0
        
        # QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s') 
        QQ = util.classes.NumMap()
        for s in model.S():
            for a in model.A(s):
                value = model.R(s,a)
                T = model.T(s,a)
                value += sum(model.gamma * t * V[s_prime] for (s_prime, t) in T.items())
                QQ[ (s,a) ] = value

        # To obtain the log policy, take the max Q at each state and build a new Q with
        # each (s,a) = old Q(s,a) - (that state's max); a standalone sketch follows this method.

        return QQ
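
# The comment above describes the per-state normalization that the companion solvers
# below actually perform (Q(s,a) - max_a' Q(s,a')).  A minimal, self-contained sketch
# on plain dicts (util.classes.NumMap behaves the same way); toy data, not part of
# this codebase:
def normalize_q_per_state(Q):
    """Return Q'(s,a) = Q(s,a) - max_a' Q(s,a'), i.e. the log of an unnormalized softmax policy."""
    per_state_max = {}
    for (s, a), q in Q.items():
        if s not in per_state_max or q > per_state_max[s]:
            per_state_max[s] = q
    return {(s, a): q - per_state_max[s] for (s, a), q in Q.items()}

print(normalize_q_per_state({('s0', 'left'): 1.0, ('s0', 'right'): 3.0}))  # right -> 0.0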
    def solve(self, model, initial, true_samples, other_policy):
        '''
        Returns a pair (agent, weights) where the agent attempts to generalize
        from the behavior observed in samples and weights is what was combined with
        the MDP to generate agent.
        
        model: an MDP with a linear reward function.  Parameters WILL be overwritten.
        initial: initial distribution over states
        true_samples: a list of sample trajectories [ (s_t,a_t) ] of the (supposedly) optimal
            policy.
        '''
        # Initial weight vector
#        w_0 = model.feature_function.params
        self.other_policy = other_policy
        self.full_initial = util.classes.NumMap()
        for s in model.S():
            self.full_initial[s] = 1.0
        self.full_initial = self.full_initial.normalize()
        
        # Compute feature expectations of agent = mu_E from samples
        (mu_E, sa_freq) = self.feature_expectations2(model, true_samples)
        print("True Samples", mu_E)
        
        self.Q_value = None
        w = numpy.random.random((model.reward_function.dim,))
#        w = numpy.zeros((model.reward_function.dim,))
        
        w_opt = scipy.optimize.fmin_bfgs(self.maxEntObjValue, w, fprime=self.maxEntObjGradient,
                                         args=(model, initial, mu_E, len(true_samples[0]), sa_freq),
                                         gtol=self._epsilon, maxiter=self._max_iter)
             
        print(w_opt)
        model.reward_function.params = w_opt
        agent = self._solver.solve(model)
        
        return (agent, w_opt)
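
# The fmin_bfgs call above supplies an analytic gradient (maxEntObjGradient) and a
# tuple of extra arguments that scipy forwards to both callbacks, with gtol as the
# gradient-norm stopping tolerance.  A minimal, self-contained toy sketch of that
# calling pattern (quadratic objective; illustrative only, not part of this codebase):
import numpy
import scipy.optimize

def toy_obj(w, target):
    return numpy.sum((w - target) ** 2)

def toy_grad(w, target):
    return 2.0 * (w - target)

w_opt_toy = scipy.optimize.fmin_bfgs(toy_obj, numpy.zeros(3), fprime=toy_grad,
                                     args=(numpy.ones(3),), gtol=1e-6, maxiter=100)
print(w_opt_toy)  # approximately [1. 1. 1.]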
def QValueSoftMaxSolve(model, thresh=1):

    v = util.classes.NumMap()
    for s in model.S():
        v[s] = 0.0

    diff = 100.0
    
    while diff >= thresh:
        vp = v
        
        Q = util.classes.NumMap()
        for s in model.S():
            for a in model.A(s):
                value = model.R(s,a)
                T = model.T(s,a)
                value += sum(model.gamma * t * v[s_prime] for (s_prime, t) in T.items())
                Q[ (s,a) ] = value            
        
        v = util.classes.NumMap()

        # soft max over actions via log-sum-exp; subtract each state's max Q for numerical stability
        for s in model.S():
            maxx = None
            for a in model.A(s):
                if maxx is None or Q[(s,a)] > maxx:
                    maxx = Q[(s,a)]

            e_sum = 0
            for a in model.A(s):
                e_sum += math.exp(Q[(s,a)] - maxx)

            v[s] = maxx + math.log(e_sum)

        diff = max(abs(value - vp[s]) for (s, value) in v.items())
        
        
    logp = util.classes.NumMap()
    for (sa, value) in Q.items():
        logp[sa] = value - v[sa[0]]
    return logp
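
# QValueSoftMaxSolve returns log pi(a|s) = Q(s,a) - V_soft(s), where V_soft(s) is the
# log-sum-exp of Q over the actions of s.  A minimal sketch (plain dict stand-in for
# util.classes.NumMap, hypothetical toy values) of exponentiating that log policy into
# per-state action distributions:
import math

def logp_to_policy(logp):
    policy = {}
    for (s, a), lp in logp.items():
        policy.setdefault(s, {})[a] = math.exp(lp)
    # probabilities for each state already sum to ~1 because V_soft(s) was subtracted
    return policy

print(logp_to_policy({('s0', 'fwd'): math.log(0.7), ('s0', 'stop'): math.log(0.3)}))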
Example #4
def project(model, agent, initial, t_max):
    '''
    Does a full state space simulation, allowing
    each state at each timestep to have < 1.0
    probability of occupancy.
    '''
    # will return an array of dicts

    # build initial state distribution
    result = [
        {},
    ]
    for s in model.S():
        if s in initial:
            result[0][s] = initial[s]
        else:
            result[0][s] = 0

    t = 1

    while t < t_max:
        dist = {}
        for s in model.S():
            if result[t - 1][s] == 0:
                continue

            for (a, pr_a) in agent.actions(s).items():
                for (s_prime, pr_s_prime) in model.T(s, a).items():
                    if s_prime not in dist:
                        dist[s_prime] = pr_a * pr_s_prime * result[t - 1][s]
                    else:
                        dist[s_prime] += pr_a * pr_s_prime * result[t - 1][s]

        for s in model.S():
            if s not in dist:
                dist[s] = 0

        result.append(dist)
        t += 1

    return result
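
# Minimal usage sketch for project() (model, agent and initial are assumed to exist as
# above; illustrative only, not part of the original code).  Because the agent's action
# distribution and model.T(s,a) each sum to 1, every timestep's occupancy dict should
# also sum to approximately 1:
#
#   occupancy = project(model, agent, initial, t_max=10)
#   for t, dist in enumerate(occupancy):
#       assert abs(sum(dist.values()) - 1.0) < 1e-6, (t, sum(dist.values()))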
 def iter(cls, model, Q):
     V = util.classes.NumMap()
     # Compute V(s) = max_{a} Q(s,a)
     for s in model.S():
         V_s = util.classes.NumMap()
         for a in model.A(s):
             V_s[a] = Q[ (s,a) ]
         if len(V_s) > 0:
             V[s] = V_s.max()
         else:
             V[s] = 0.0
     
     # QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s') 
     QQ = util.classes.NumMap()
     for s in model.S():
         for a in model.A(s):
             value = model.R(s,a)
             T = model.T(s,a)
             value += sum(model.gamma * t * V[s_prime] for (s_prime, t) in T.items())
             QQ[ (s,a) ] = value
     return QQ
    def solve(self, model, true_samples, s_r_prior, gridsize, max_states):
        '''
        Returns a policy (agent) inferred from the behavior observed in the samples,
        found by proposing random neighbors of the current reward function and
        accepting them with probability min(1, prior ratio).

        model: an MDP with a linear reward function.  Parameters WILL be overwritten.
        true_samples: a list of sample trajectories [ (s_t,a_t) ] of the (supposedly) optimal
            policy.
        '''
        # Initial weight vector
#        w_0 = model.feature_function.params

        R = patrol.rewardbayesian.BayesianPatrolReward(len(model.S())*len(model.A()), gridsize)
        
        model.reward_function = R
        
        pi = self._solver.solve(model)
        
        for i in range(self._max_iter):
            R_tilde = R.randNeighbor()
            
            model.reward_function = R_tilde
            
            Q_pi = self._q_value_solver.solve(model)
            
            found_worse = False
            for history in true_samples:
                for (s, a) in history:
                    
                    if 0 <= s.location[0] < max_states and Q_pi[(s, list(pi.actions(s).keys())[0])] < Q_pi[(s, a)]:
#                        print(a, Q_pi[(s, a)], pi.actions(s).keys()[0], Q_pi[(s, pi.actions(s).keys()[0])])
                        found_worse = True
                        break
                
            if found_worse:
                pi_tilde = self._solver.solve(model)
                
                chance = min(1, s_r_prior.prior(pi_tilde, R_tilde) / s_r_prior.prior(pi, R))

                if random.random() < chance:
                    pi = pi_tilde
                    R = R_tilde
            else:
                chance = min(1, s_r_prior.prior(pi, R_tilde) / s_r_prior.prior(pi, R))
                if random.random() < chance:
                    R = R_tilde
                
        model.reward_function = R
        return pi
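
# Both branches above use the same Metropolis-Hastings acceptance rule: keep the
# proposed reward (and, when a trajectory pair looked worse, the re-solved policy)
# with probability min(1, prior_new / prior_old).  A standalone sketch of that rule
# (the helper name is illustrative, not part of this codebase):
import random

def mh_accept(prior_new, prior_old):
    """Accept a proposal with probability min(1, prior_new / prior_old)."""
    return random.random() < min(1.0, prior_new / prior_old)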
 def maxEntObjGradient(self, w, model, initial, mu_E, true_samples_len, sa_freq):
     
     if self.Q_value is None:
         model.reward_function.params = w
         agent = self._solver.solve(model) #shouldn't be doing this!
         
     else:
         policy = {}
         for s in model.S():
             actions = util.classes.NumMap()
             for a in model.A(s):
                 actions[a] = self.Q_value[ (s,a) ]
             policy[s] = actions.argmax()
         agent = mdp.agent.MapAgent(policy)
         
     samples = self.generate_samples(model, agent, initial, true_samples_len)
     _mu = self.feature_expectations(model, samples)
                 
     print(w, mu_E - _mu)
     return -( mu_E - _mu)
 def solve(self, model):
     '''Returns a map of (state, action) => q-value determined by this solver'''
     Q = util.classes.NumMap()
     for i in range(self._max_iter):
         Q = self.iter(model, Q)

     returnQ = util.classes.NumMap()
     
     V = util.classes.NumMap()
     # Compute V(s) = max_{a} Q(s,a)
     for s in model.S():
         V_s = util.classes.NumMap()
         for a in model.A(s):
             V_s[a] = Q[ (s,a) ]
         if len(V_s) > 0:
             V[s] = V_s.max()
         else:
             V[s] = 0.0
     
     for (sa, value) in Q.items():
         returnQ[sa] = value - V[sa[0]]
     
     return returnQ
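
# solve() above returns Q(s,a) - max_a' Q(s,a'), so each state's greedy action is the
# one whose normalized value is 0.  A minimal sketch (plain dict stand-in for
# util.classes.NumMap, toy values) of recovering that greedy policy, mirroring the
# per-state argmax pattern used in maxEntObjGradient:
def greedy_policy(returnQ):
    best = {}
    for (s, a), v in returnQ.items():
        if s not in best or v > best[s][1]:
            best[s] = (a, v)
    return {s: a for s, (a, v) in best.items()}

print(greedy_policy({('s0', 'fwd'): 0.0, ('s0', 'stop'): -1.2}))  # {'s0': 'fwd'}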
Example #9
def init():

    global pub
    global all_stop_pub
    global pub_status
    global model
    global policy
    global lastpublish
    global savedPose
    global amclmessageReceived
    global ground_truth
    global add_delay
    global otherPose
    global lastGoal
    global go
    global maxsightdistance
    global dr_client
    global home
    global timestep
    global current_cg, curr_st, next_st

    rospy.init_node('patrolcontrol')

    global recordConvTraj, useRegions
    recordConvTraj = 0
    useRegions = 0
    recordConvTraj = int(rospy.get_param("~recordConvTraj"))
    observableStates = int(rospy.get_param("~observableStates"))
    useRegions = int(rospy.get_param("~useRegions"))
    data_log = str(rospy.get_param("~dataLog"))

    rospy.Subscriber("amcl_pose", PoseWithCovarianceStamped, handle_pose)
    pub = rospy.Publisher("move_base_simple/goal", PoseStamped)

    all_stop_pub = rospy.Publisher("all_stop_enabled", std_msgs.msg.Bool)

    initialpose = rospy.Publisher("initialpose", PoseWithCovarianceStamped)
    rospy.Subscriber("base_pose_ground_truth", Odometry, save_ground_truth)

    pub_status = rospy.Publisher("/study_attacker_status", String, latch=True)

    global skip_interaction
    if not skip_interaction:
        if add_delay:
            rospy.Subscriber("/robot_0/base_pose_ground_truth", Odometry,
                             handle_other)
        else:
            rospy.Subscriber("/robot_1/base_pose_ground_truth", Odometry,
                             handle_other)

    random.seed()
    np.random.seed()

    ## Initialize constants

    p_fail = 0.05
    longHallway = 10
    shortSides = 2
    patrolAreaSize = longHallway + shortSides + shortSides
    observableStateLow = patrolAreaSize / 2 - 1
    observableStateHigh = patrolAreaSize / 2
    observableStateLow = 0
    observableStateHigh = patrolAreaSize

    # calculate farness for each node in the patrolled area
    farness = np.zeros(patrolAreaSize)
    for i in range(patrolAreaSize):
        total = 0
        for j in range(patrolAreaSize):
            total += abs(i - j)

        farness[i] = total

    global m
    global ogmap
    current_cg = None
    ## Create reward function

    if (m == "boydright"):
        mapparams = boydrightMapParams(False)
        ogmap = OGMap(*mapparams)
        ## Create Model
        model = patrol.model.PatrolModel(p_fail, None, ogmap.theMap())
        model.gamma = 0.9

    elif m == "largeGridPatrol":
        current_cg = 4
        mapparams = largeGridMapParams(False)
        ogmap = OGMap(*mapparams)
        ## Create Model
        # 		model = patrol.model.PatrolModel(p_fail, PatrolState(np.array( [6, 8, 0] )), ogmap.theMap())
        model = patrol.model.PatrolExtendedModel(0.01, PatrolExtendedState\
                  (np.array( [-1,-1, -1] ),0), \
                  ogmap.theMap())
        model.gamma = 0.9

    elif (m == "boydright2"):
        mapparams = boydright2MapParams(False)

        ogmap = OGMap(*mapparams)

        ## Create Model

        model = patrol.model.PatrolModel(p_fail, None, ogmap.theMap())

        model.gamma = 0.9

    else:
        mapparams = boyd2MapParams(False)
        ogmap = OGMap(*mapparams)
        ## Create Model
        model = patrol.model.PatrolModel(p_fail, None, ogmap.theMap())
        model.gamma = 0.9

    # Call external solver here instead of Python based
    args = [
        "boydsimple_t",
    ]
    import subprocess

    p = subprocess.Popen(args,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

    stdin = m + "\n"
    stdin += "1.0\n"

    (transitionfunc, stderr) = p.communicate(stdin)

    args = [
        "boydpatroller",
    ]
    p = subprocess.Popen(args,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)

    stdin = m + "\n"
    stdin += str(useRegions) + "\n"

    stdin += transitionfunc

    f = open(home + "/patrolstudy/toupload/patrollerT.log", "w")
    f.write(stdin)
    f.close()

    (stdout, stderr) = p.communicate(stdin)
    print("\n computed patroller's policy")

    if m == "largeGridPatrol":
        correctpolicy = home + "/patrolstudy/largeGridpolicy_mdppatrol"
    if m == "boydright":
        correctpolicy = home + "/patrolstudy/boydright_policy"
    if m == "boydright2":
        correctpolicy = home + "/patrolstudy/boydright2_policy"
    if m == "boyd2":
        correctpolicy = home + "/patrolstudy/boydpolicy_mdppatrolcontrol"

    # stdout now needs to be parsed into a hash of state => action, which is then sent to map agent
# 	correctpolicy = "/home/saurabh/patrolstudy/largeGridpolicy_mdppatrol"
    f = open(correctpolicy, "w")
    f.write("")
    f.close()

    f = open(correctpolicy, "w")
    outtraj = ""
    p = {}
    stateactions = stdout.split("\n")
    for stateaction in stateactions:
        temp = stateaction.split(" = ")
        if len(temp) < 2: continue
        state = temp[0]
        action = temp[1]

        outtraj += state + " = " + action + "\n"

        # 		ps = patrol.model.PatrolState(np.array([int(pieces[0]), int(pieces[1]), int(pieces[2])]))
        if m == "largeGridPatrol":
            state = state.split(";")
            loc = state[0]
            current_goal = int(state[1])
            loc = loc[1:len(loc) - 1]
            pieces = loc.split(",")
            location = np.array(
                [int(pieces[0]),
                 int(pieces[1]),
                 int(pieces[2])])
            ps = patrol.model.PatrolExtendedState(location, current_goal)

        else:
            state = state[1:len(state) - 1]
            pieces = state.split(",")
            ps = patrol.model.PatrolState(
                np.array([int(pieces[0]),
                          int(pieces[1]),
                          int(pieces[2])]))

        if action == "MoveForwardAction":
            a = patrol.model.PatrolActionMoveForward()
        elif action == "TurnLeftAction":
            a = patrol.model.PatrolActionTurnLeft()
        elif action == "TurnAroundAction":
            a = patrol.model.PatrolActionTurnAround()
        elif action == "TurnRightAction":
            a = patrol.model.PatrolActionTurnRight()
        else:
            a = patrol.model.PatrolActionStop()

        p[ps] = a

    f.write(outtraj)
    f.close()

    #  	pub_status.publish(String("abortrun"))

    from mdp.agent import MapAgent
    policy = MapAgent(p)

    # 	sample trajectories for m == "largeGrid":
    if m == "largeGridPatrol" and recordConvTraj == 1:
        from util.classes import NumMap
        state_dict = {}
        for s in model.S():
            if s.location[0] <= 7 and s.location[1] <= 1:
                state_dict[s] = 1.0
# 		state_dict = { PatrolState([0,0,0]):1.0 }
        initial = NumMap(state_dict).normalize()
        # 		data_log = "/home/saurabh/patrolstudy/toupload/recd_convertedstates_14_14_200states_LGR.log"
        f = open(data_log, "w")
        from mdp.simulation import sample_traj
        n_samples = 45
        t_max = 50
        outtraj = ""
        # 		outtraj2 = "observableStates: "
        # 		outtraj2 += str(observableStates)
        # 		s = ogmap.toState((6, 0, 0), False)
        # 		s = PatrolState(np.array( [6, 0, 0] ))
        # 		outtraj2 += str(largeGridisvisible(s, observableStates))
        # 		pub_status.publish(String("datacollected"))

        pair_ct = 0
        for i in range(n_samples):
            traj_list = sample_traj(model, t_max, initial, policy)
            for (s, a, s_p) in traj_list:
                # 				outtraj2 += str(s.location[0:3])
                # 				outtraj2 += str(largeGridisvisible(s, observableStates))+"\n"
                if largeGridisvisible(s, observableStates):
                    s = s.location[0:3]
                    # print(s)
                    # print(sap[1].__class__.__name__)
                    outtraj += str(s[0]) + "," + str(s[1]) + "," + str(
                        s[2]) + ","
                    if a.__class__.__name__ == "PatrolActionMoveForward":
                        outtraj += "PatrolActionMoveForward"
                    elif a.__class__.__name__ == "PatrolActionTurnLeft":
                        outtraj += "PatrolActionTurnLeft"
                    elif a.__class__.__name__ == "PatrolActionTurnRight":
                        outtraj += "PatrolActionTurnRight"
                    elif a.__class__.__name__ == "PatrolActionTurnAround":
                        outtraj += "PatrolActionTurnAround"
                    else:
                        outtraj += "PatrolActionStop"
                else:
                    outtraj += "None"

                outtraj += "\n"
                pair_ct += 1
# 			outtraj2 += "\n"
            if pair_ct >= 200:
                outtraj += "ENDREC\n"
                pair_ct = 0

        f.write(outtraj)
        # 		f.write(outtraj2)
        f.close()

#	policy = mdp.solvers.PolicyIteration(30, mdp.solvers.IteratingPolicyEvaluator2(.5)).solve(model)
#	policy = mdp.solvers.PolicyIteration(30, mdp.solvers.IteratingPolicyEvaluator(20)).solve(model)
#	policy = mdp.solvers.ValueIteration(.5).solve(model)
    print("Finished solving mdp")

    has_slowed_down = False
    dr_client = dynamic_reconfigure.client.Client(
        "move_base_node/TrajectoryPlannerROS")
    print("dr_client initialized")

    r = rospy.Rate(8)
    while not rospy.is_shutdown():

        # 		print "savedPose is not None and otherPose is not None and canSee(savedPose, otherPose)"
        # 		print (savedPose is not None and otherPose is not None and canSee(savedPose, otherPose))
        if m == "largeGridPatrol":
            print(current_cg)

        if savedPose is not None and otherPose is not None and canSee(
                savedPose, otherPose):

            if not add_delay:
                params = {'max_vel_x': 0.2}
                config = dr_client.update_configuration(params)
                has_slowed_down = True

            if not add_delay and lastGoal is not None:

                a = otherPose.orientation

                angles2 = tf.transformations.euler_from_quaternion(
                    [a.x, a.y, a.z, a.w])

                if (angles2[2] < 0):
                    angles2 = (0, 0, 2 * math.pi + angles2[2])
                otherState = ogmap.toState(
                    (otherPose.position.x, otherPose.position.y, angles2[2]),
                    False)

                if lastGoal.conflicts(otherState):
                    #					if rospy.get_time() - lastpublish >= timestep:
                    lg = lastGoal
                    while (lg.conflicts(otherState)):
                        if m == "largeGridPatrol":
                            curr_st = PatrolExtendedState(
                                lg.location, current_cg)
                            # print "(lg.conflicts(otherState)) next_st = policy.actions(curr_st).keys()[0].apply(curr_st)"
                            next_st = list(policy.actions(curr_st).keys())[0].apply(
                                curr_st)
                            lg = PatrolState(next_st.location)
                        else:
                            lg = list(policy.actions(lg).keys())[0].apply(lg)

                    sendGoal(lg)
                else:
                    nextGoal()

            if add_delay and go:

                go = False

                topub = std_msgs.msg.Bool()
                topub.data = True

                all_stop_pub.publish(topub)

        elif savedPose is not None:

            if otherPose is not None and not canSee(
                    savedPose, otherPose) and has_slowed_down:
                params = {'max_vel_x': 0.5}
                config = dr_client.update_configuration(params)
                has_slowed_down = False


# 			print "nextGoal() in elif savedPose is not None"
            nextGoal()

        time.sleep(0.25)
    def solve(self, model, initial, true_samples, other_policy):
        '''
        Returns a pair (agent, weights) where the agent attempts to generalize
        from the behavior observed in samples and weights is what was combined with
        the MDP to generate agent.
        
        model: an MDP with a linear reward function.  Parameters WILL be overwritten.
        initial: initial distribution over states
        true_samples: a list of sample trajectories [ (s_t,a_t) ] of the (supposedly) optimal
            policy.
        '''
        # Initial weight vector
#        w_0 = model.feature_function.params
        self.other_policy = other_policy
        self.full_initial = util.classes.NumMap()
        for s in model.S():
            self.full_initial[s] = 1.0
        self.full_initial = self.full_initial.normalize()
                
        # Compute feature expectations of agent = mu_E from samples
        mu_E = self.feature_expectations2(model, true_samples)
        print("True Samples", mu_E)
        # Pick random policy pi^(0)
        agent = mdp.agent.RandomAgent( model.A() )
        
        # Calculate feature expectations of pi^(0) = mu^(0)
        samples = self.generate_samples(model, agent, initial, len(true_samples[0]))
        mu = self.feature_expectations(model, samples )

#        mu = self.feature_expectations2(model, initial, agent )
        lastT = 0

        for i in range(self._max_iter):
            # Perform projections to new weights w^(i)
            if i == 0:
                mu_bar = mu
            else:
                mmmb = mu - mu_bar
                mu_bar = mu_bar + numpy.dot( mmmb, mu_E-mu_bar )/numpy.dot( mmmb,mmmb )*mmmb
            w = mu_E - mu_bar
            t = numpy.linalg.norm(mu_E - mu_bar)
            w[0] = abs(w[0])
            print(w)
            model.reward_function.params = w
            
            print('IRLApproxSolver Iteration #{}, t = {:4.4f}'.format(i, t))
            if t < self._epsilon:
                break
            if abs(t - lastT) < .000001:
                break

            lastT = t
            
            # Compute optimal policy used R(s,a) = dot( feature_f(s,a), w^(i) )
            if (numpy.linalg.norm(mu) == 0):
                agent = mdp.agent.RandomAgent( model.A() )
            else:
                agent = self._solver.solve(model)
                        
            # Compute feature expectations of pi^(i) = mu^(i)
            samples = self.generate_samples(model, agent, initial, len(true_samples[0]))
            mu = self.feature_expectations(model, samples)
            print(mu)
#            mu = self.feature_expectations2(model, initial, agent)
            
        # Restore initial weight vector
#        model.feature_function.params = w_0
        return (agent, w)
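
# The mu_bar update above is the projection step of Abbeel & Ng-style apprenticeship
# learning: mu_bar moves toward mu along the segment between them, shrinking
# t = ||mu_E - mu_bar|| over iterations.  A minimal numpy sketch of that update in
# isolation (toy vectors, not part of this codebase):
import numpy

def project_mu_bar(mu_bar, mu, mu_E):
    mmmb = mu - mu_bar
    return mu_bar + numpy.dot(mmmb, mu_E - mu_bar) / numpy.dot(mmmb, mmmb) * mmmb

mu_E = numpy.array([1.0, 0.0])
new_mu_bar = project_mu_bar(numpy.array([0.0, 0.0]), numpy.array([0.5, 0.5]), mu_E)
print(new_mu_bar, numpy.linalg.norm(mu_E - new_mu_bar))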
Example #11
def stupidPythonIdiots():
    global resetPub
    global perceptPub
    global calcPub
    global mdpId
    global states
    global actions

    print(mdpId)
    rospy.wait_for_service('irlsimulate')

    initService()
    initNode()

    print(mdpId)
    p_fail = 0.05
    longHallway = 10
    shortSides = 4
    patrolAreaSize = longHallway + shortSides + shortSides
    observableStateLow = 7
    observableStateHigh = 8

    # calculate farness for each node in the patrolled area
    farness = np.zeros(patrolAreaSize)
    for i in range(patrolAreaSize):
        total = 0
        for j in range(patrolAreaSize):
            total += abs(i - j)

        farness[i] = total

    ## Create reward function
    reward = patrol.reward.PatrolReward(patrolAreaSize, farness,
                                        observableStateLow,
                                        observableStateHigh)
    reward_weights = np.zeros(reward.dim)
    reward_weights[0] = .2
    reward_weights[1] = .35
    reward_weights[2] = .45
    reward_weights[3] = 0
    reward_weights[4] = 0

    reward.params = reward_weights

    ## Create Model
    model = patrol.model.PatrolModel(p_fail, longHallway, shortSides)
    model.reward_function = reward
    model.gamma = 0.999

    states = model.S()
    actions = model.A()

    ## Create initial distribution
    initial = util.classes.NumMap()
    for s in model.S():
        initial[s] = 1.0
    initial = initial.normalize()

    ## Define feature function (approximate methods only)
    #    feature_function = mdp.etc.StateActionFeatureFunction(model)
    #    feature_function = mdp.etc.StateFeatureFunction(model)
    #    feature_function = gridworld.etc.GWLocationFF(model)

    ## Define player
    #    policy = mdp.agent.HumanAgent(model)
    opt_policy = mdp.solvers.ValueIteration(50).solve(model)

    j = 0
    for (s, a, r) in mdp.simulation.simulate(model, opt_policy, initial, 68):
        if (s.location[0] < observableStateLow):
            pass
        elif (s.location[0] > observableStateHigh):
            pass
        else:
            perceptPub.publish(
                percept(mdpId=mdpId,
                        state=stateToId(s),
                        action=actionToId(a),
                        time=j))
        j += 1

    centerObs = util.classes.NumMap()
    for s in model.S():
        centerObs[s] = 0
        if (s.location[0] == (observableStateLow + observableStateHigh) / 2):
            centerObs[s] = 1
    centerObs = centerObs.normalize()
    s = mdpId
    calcPub.publish(String(s))

    raw_input("Percepts Sent, Press Enter to continue...")

    policyPxy = rospy.ServiceProxy('irlpolicy', policy)
    est_p = policyPxy(policyRequest(mdpId))

    est_policy = util.classes.NumMap()
    for (i, a) in enumerate(est_p.policy):
        est_policy[idToState(i)] = idToAction(a)

    mdp.etc.policy_report(opt_policy, est_policy,
                          mdp.solvers.ExactPolicyEvaluator(), model, centerObs)

    for s in model.S():
        print('s = %s, pi*(s) = %s, pi_E(s) = %s' % (s, opt_policy.actions(s),
                                                     est_policy.actions(s)))
    print('pi* and pi_E disagree on {} of {} states'.format(
        len([
            s for s in model.S()
            if opt_policy.actions(s) != est_policy.actions(s)
        ]), len(model.S())))

    simulatePxy = rospy.ServiceProxy('irlsimulate', simulate)
    enc_policy = simulatePxy(simulateRequest(mdpId)).state_actions