def iter(cls, model, Q):
    V = util.classes.NumMap()
    # Compute V(s) = max_{a} Q(s,a)
    for s in model.S():
        V_s = util.classes.NumMap()
        for a in model.A(s):
            V_s[a] = Q[(s, a)]
        if len(V_s) > 0:
            V[s] = V_s.max()
        else:
            V[s] = 0.0

    # QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s')
    QQ = util.classes.NumMap()
    for s in model.S():
        for a in model.A(s):
            value = model.R(s, a)
            T = model.T(s, a)
            value += sum([model.gamma * t * V[s_prime] for (s_prime, t) in T.items()])
            QQ[(s, a)] = value

    # to find the log policy, find the argmax at each state and then create a
    # new Q with each (s,a) = oldQ - (max for that state)
    return QQ

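# The second loop above is one sweep of Q-value iteration:
#   QQ(s,a) = R(s,a) + gamma * sum_{s'} T(s,a,s') * V(s'),
# with V taken from the previous Q.  A minimal standalone sketch of that backup over plain
# dicts, assuming (as the .items() call above suggests) that model.T(s, a) returns a
# {next_state: probability} map; the helper name is hypothetical, for illustration only.
def bellman_backup(model, V):
    """One Bellman backup of Q from a state-value table V."""
    Q = {}
    for s in model.S():
        for a in model.A(s):
            Q[(s, a)] = model.R(s, a) + model.gamma * sum(
                p * V[s_next] for (s_next, p) in model.T(s, a).items())
    return Q
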
def QValueSoftMaxSolve(model, thresh=1):
    '''Soft (log-sum-exp) value iteration: returns a map of (state, action) => log pi(a|s),
    i.e. Q(s,a) minus the soft state value, iterating until successive value tables
    differ by less than thresh.'''
    v = util.classes.NumMap()
    for s in model.S():
        v[s] = 0.0

    diff = 100.0
    while diff >= thresh:
        vp = v

        # Q(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*v(s')
        Q = util.classes.NumMap()
        for s in model.S():
            for a in model.A(s):
                value = model.R(s, a)
                T = model.T(s, a)
                value += sum([model.gamma * t * v[s_prime] for (s_prime, t) in T.items()])
                Q[(s, a)] = value

        v = util.classes.NumMap()
        # need the max action for each state!
        for s in model.S():
            maxx = None
            for a in model.A(s):
                if (maxx is None) or Q[(s, a)] > maxx:
                    maxx = Q[(s, a)]
            e_sum = 0
            for a in model.A(s):
                e_sum += math.exp(Q[(s, a)] - maxx)
            v[s] = maxx + math.log(e_sum)

        diff = max(abs(value - vp[s]) for (s, value) in v.iteritems())

    logp = util.classes.NumMap()
    for (sa, value) in Q.iteritems():
        logp[sa] = value - v[sa[0]]
    return logp

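# The per-state update above is the soft (log-sum-exp) Bellman operator,
# V(s) = log sum_a exp Q(s,a), computed as maxx + log(sum_a exp(Q(s,a) - maxx)) so the
# exponentials cannot overflow.  A minimal standalone sketch of that trick over a plain
# list of Q-values; the helper name is hypothetical, for illustration only.
import math

def soft_max_value(q_values):
    """Numerically stable log-sum-exp of the Q-values available at one state."""
    maxx = max(q_values)
    return maxx + math.log(sum(math.exp(q - maxx) for q in q_values))

# Example: soft_max_value([1.0, 2.0, 3.0]) is roughly 3.41, slightly above the hard max of 3.0.
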
def iter(cls, model, Q):
    V = util.classes.NumMap()
    # Compute V(s) = max_{a} Q(s,a)
    for s in model.S():
        V_s = util.classes.NumMap()
        for a in model.A(s):
            V_s[a] = Q[(s, a)]
        if len(V_s) > 0:
            V[s] = V_s.max()
        else:
            V[s] = 0.0

    # QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s')
    QQ = util.classes.NumMap()
    for s in model.S():
        for a in model.A(s):
            value = model.R(s, a)
            T = model.T(s, a)
            value += sum([model.gamma * t * V[s_prime] for (s_prime, t) in T.items()])
            QQ[(s, a)] = value

    return QQ

def solve(self, model, true_samples, s_r_prior, gridsize, max_states):
    '''
    Bayesian reward search: repeatedly proposes a neighboring reward function and
    accepts or rejects it using a Metropolis-style ratio of s_r_prior values,
    re-solving for the policy whenever some demonstrated action beats the current
    policy's action under the proposed reward.  Returns the current policy after the
    final iteration; model.reward_function is left set to the final accepted reward.

    model: an MDP with a linear reward function.  Parameters WILL be overwritten.
    true_samples: a list of sample trajectories [ (s_t,a_t) ] of the (supposedly) optimal policy.
    s_r_prior: prior over (policy, reward) pairs used in the acceptance ratio.
    gridsize: passed through to BayesianPatrolReward.
    max_states: only states with location index in [0, max_states) are checked against the demonstrations.
    '''
    # Initial weight vector
    # w_0 = model.feature_function.params

    R = patrol.rewardbayesian.BayesianPatrolReward(len(model.S()) * len(model.A()), gridsize)
    model.reward_function = R
    pi = self._solver.solve(model)

    for i in range(self._max_iter):
        R_tilde = R.randNeighbor()
        model.reward_function = R_tilde
        Q_pi = self._q_value_solver.solve(model)

        found_worse = False
        for history in true_samples:
            for (s, a) in history:
                if s.location[0] >= 0 and s.location[0] < max_states and Q_pi[(s, pi.actions(s).keys()[0])] < Q_pi[(s, a)]:
                    # print(a, Q_pi[(s, a)], pi.actions(s).keys()[0], Q_pi[(s, pi.actions(s).keys()[0])])
                    found_worse = True
                    break

        if found_worse:
            pi_tilde = self._solver.solve(model)
            chance = min(1, s_r_prior.prior(pi_tilde, R_tilde) / s_r_prior.prior(pi, R))
            if random.random() < chance:
                pi = pi_tilde
                R = R_tilde
        else:
            chance = min(1, s_r_prior.prior(pi, R_tilde) / s_r_prior.prior(pi, R))
            if random.random() < chance:
                R = R_tilde

    model.reward_function = R
    return pi

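# The accept/reject step above follows the Metropolis rule: a candidate reward is kept
# with probability min(1, p(candidate) / p(current)), where p comes from s_r_prior here.
# A minimal sketch of that rule in isolation; the function and the score arguments are
# hypothetical stand-ins for the prior values, for illustration only.
import random

def metropolis_accept(score_candidate, score_current):
    """Return True with probability min(1, score_candidate / score_current)."""
    chance = min(1.0, score_candidate / score_current)
    return random.random() < chance
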
def maxEntObjGradient(self, w, model, initial, mu_E, true_samples_len, sa_freq):
    '''Gradient of the max-entropy objective with respect to the reward weights w:
    returns -(mu_E - mu), the negated difference between the expert feature expectations
    and those of the current policy, estimated from sampled trajectories.'''
    if self.Q_value is None:
        model.reward_function.params = w
        agent = self._solver.solve(model)  # shouldn't be doing this!
    else:
        # Greedy policy from the cached Q-values
        policy = {}
        for s in model.S():
            actions = util.classes.NumMap()
            for a in model.A(s):
                actions[a] = self.Q_value[(s, a)]
            policy[s] = actions.argmax()
        agent = mdp.agent.MapAgent(policy)

    samples = self.generate_samples(model, agent, initial, true_samples_len)
    _mu = self.feature_expectations(model, samples)
    print(w, mu_E - _mu)
    return -(mu_E - _mu)

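# The value returned above is -(mu_E - mu) = mu - mu_E.  The gradient of the max-entropy
# IRL log-likelihood is mu_E - mu (expert feature expectations minus the learner's), so the
# negation is presumably what a minimizing optimizer expects.  A minimal sketch of just that
# quantity, assuming the feature expectations are numpy-compatible vectors; the helper name
# is hypothetical, for illustration only.
import numpy

def maxent_neg_gradient(mu_expert, mu_learner):
    """Negated MaxEnt IRL gradient: a minimizer moves w to shrink (mu_expert - mu_learner)."""
    return -(numpy.asarray(mu_expert) - numpy.asarray(mu_learner))
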
def solve(self, model):
    '''Returns a map of (state, action) => q-value determined by this solver,
    normalized so that the best action at each state has value 0.'''
    Q = util.classes.NumMap()
    for i in range(self._max_iter):
        Q = self.iter(model, Q)

    returnQ = util.classes.NumMap()
    V = util.classes.NumMap()
    # Compute V(s) = max_{a} Q(s,a)
    for s in model.S():
        V_s = util.classes.NumMap()
        for a in model.A(s):
            V_s[a] = Q[(s, a)]
        if len(V_s) > 0:
            V[s] = V_s.max()
        else:
            V[s] = 0.0

    # Subtract the per-state max so the greedy action's value is 0
    for (sa, value) in Q.iteritems():
        returnQ[sa] = value - V[sa[0]]
    return returnQ

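# The map returned above is Q(s,a) minus the per-state max, so the greedy action at each
# state has value 0 and every other action is <= 0.  A minimal sketch of reading a greedy
# policy back out of such a map, using plain dicts and callables rather than the project's
# NumMap and model classes; the names are hypothetical, for illustration only.
def greedy_policy(normalized_q, states, actions_for):
    """For each state, pick an action whose normalized Q-value is maximal (i.e. 0)."""
    policy = {}
    for s in states:
        policy[s] = max(actions_for(s), key=lambda a: normalized_q[(s, a)])
    return policy
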
def solve(self, model, initial, true_samples, other_policy):
    '''
    Returns a pair (agent, weights) where the agent attempts to generalize
    from the behavior observed in the samples and weights is what was combined
    with the MDP to generate the agent.

    model: an MDP with a linear reward function.  Parameters WILL be overwritten.
    initial: initial distribution over states
    true_samples: a list of sample trajectories [ (s_t,a_t) ] of the (supposedly) optimal policy.
    other_policy: stored on the solver as self.other_policy.
    '''
    # Initial weight vector
    # w_0 = model.feature_function.params

    self.other_policy = other_policy

    self.full_initial = util.classes.NumMap()
    for s in model.S():
        self.full_initial[s] = 1.0
    self.full_initial = self.full_initial.normalize()

    # Compute feature expectations of agent = mu_E from samples
    mu_E = self.feature_expectations2(model, true_samples)
    print("True Samples", mu_E)

    # Pick random policy pi^(0)
    agent = mdp.agent.RandomAgent(model.A())

    # Calculate feature expectations of pi^(0) = mu^(0)
    samples = self.generate_samples(model, agent, initial, len(true_samples[0]))
    mu = self.feature_expectations(model, samples)
    # mu = self.feature_expectations2(model, initial, agent)

    lastT = 0
    for i in range(self._max_iter):
        # Perform projections to new weights w^(i)
        if i == 0:
            mu_bar = mu
        else:
            mmmb = mu - mu_bar
            mu_bar = mu_bar + numpy.dot(mmmb, mu_E - mu_bar) / numpy.dot(mmmb, mmmb) * mmmb
        w = mu_E - mu_bar
        t = numpy.linalg.norm(mu_E - mu_bar)
        w[0] = abs(w[0])
        print(w)
        model.reward_function.params = w

        print 'IRLApproxSolver Iteration #{}, t = {:4.4f}'.format(i, t)
        if t < self._epsilon:
            break
        if abs(t - lastT) < .000001:
            break
        lastT = t

        # Compute optimal policy using R(s,a) = dot( feature_f(s,a), w^(i) )
        if numpy.linalg.norm(mu) == 0:
            agent = mdp.agent.RandomAgent(model.A())
        else:
            agent = self._solver.solve(model)

        # Compute feature expectations of pi^(i) = mu^(i)
        samples = self.generate_samples(model, agent, initial, len(true_samples[0]))
        mu = self.feature_expectations(model, samples)
        print(mu)
        # mu = self.feature_expectations2(model, initial, agent)

    # Restore initial weight vector
    # model.feature_function.params = w_0
    return (agent, w)

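# The mu_bar update above appears to match the projection step of apprenticeship learning
# via IRL (Abbeel & Ng, 2004): project toward mu_E along the direction of the newest mu,
# then take w = mu_E - mu_bar and t = ||mu_E - mu_bar|| as the margin.  A minimal numpy
# sketch of just that update (it omits the w[0] = abs(w[0]) adjustment used above); the
# helper name is hypothetical, for illustration only.
import numpy

def project_mu_bar(mu_bar, mu, mu_E):
    """One projection update of mu_bar toward mu_E along the direction (mu - mu_bar)."""
    d = mu - mu_bar
    mu_bar_new = mu_bar + (numpy.dot(d, mu_E - mu_bar) / numpy.dot(d, d)) * d
    w = mu_E - mu_bar_new
    t = numpy.linalg.norm(w)
    return mu_bar_new, w, t
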
def stupidPythonIdiots():
    global resetPub
    global perceptPub
    global calcPub
    global mdpId
    global states
    global actions

    print(mdpId)
    rospy.wait_for_service('irlsimulate')

    initService()
    initNode()

    print(mdpId)

    p_fail = 0.05
    longHallway = 10
    shortSides = 4
    patrolAreaSize = longHallway + shortSides + shortSides
    observableStateLow = 7
    observableStateHigh = 8

    # calculate farness for each node in the patrolled area
    farness = np.zeros(patrolAreaSize)
    for i in range(patrolAreaSize):
        total = 0
        for j in range(patrolAreaSize):
            total += abs(i - j)
        farness[i] = total

    ## Create reward function
    reward = patrol.reward.PatrolReward(patrolAreaSize, farness, observableStateLow, observableStateHigh)
    reward_weights = np.zeros(reward.dim)
    reward_weights[0] = .2
    reward_weights[1] = .35
    reward_weights[2] = .45
    reward_weights[3] = 0
    reward_weights[4] = 0
    reward.params = reward_weights

    ## Create Model
    model = patrol.model.PatrolModel(p_fail, longHallway, shortSides)
    model.reward_function = reward
    model.gamma = 0.999

    states = model.S()
    actions = model.A()

    ## Create initial distribution
    initial = util.classes.NumMap()
    for s in model.S():
        initial[s] = 1.0
    initial = initial.normalize()

    ## Define feature function (approximate methods only)
    # feature_function = mdp.etc.StateActionFeatureFunction(model)
    # feature_function = mdp.etc.StateFeatureFunction(model)
    # feature_function = gridworld.etc.GWLocationFF(model)

    ## Define player
    # policy = mdp.agent.HumanAgent(model)
    opt_policy = mdp.solvers.ValueIteration(50).solve(model)

    # Simulate the optimal policy and publish only the percepts inside the observable window
    j = 0
    for (s, a, r) in mdp.simulation.simulate(model, opt_policy, initial, 68):
        if s.location[0] < observableStateLow:
            pass
        elif s.location[0] > observableStateHigh:
            pass
        else:
            perceptPub.publish(percept(mdpId=mdpId, state=stateToId(s), action=actionToId(a), time=j))
        j += 1

    centerObs = util.classes.NumMap()
    for s in model.S():
        centerObs[s] = 0
        if s.location[0] == (observableStateLow + observableStateHigh) / 2:
            centerObs[s] = 1
    centerObs = centerObs.normalize()

    s = mdpId
    calcPub.publish(String(s))

    raw_input("Percepts Sent, Press Enter to continue...")

    policyPxy = rospy.ServiceProxy('irlpolicy', policy)
    est_p = policyPxy(policyRequest(mdpId))

    est_policy = util.classes.NumMap()
    for (i, a) in enumerate(est_p.policy):
        est_policy[idToState(i)] = idToAction(a)

    mdp.etc.policy_report(opt_policy, est_policy, mdp.solvers.ExactPolicyEvaluator(), model, centerObs)

    for s in model.S():
        print 's = %s, pi*(s) = %s, pi_E(s) = %s' % (s, opt_policy.actions(s), est_policy.actions(s))
    print 'pi* and pi_E disagree on {} of {} states'.format(
        len([s for s in model.S() if opt_policy.actions(s) != est_policy.actions(s)]), len(model.S()))

    simulatePxy = rospy.ServiceProxy('irlsimulate', simulate)
    enc_policy = simulatePxy(simulateRequest(mdpId)).state_actions