def feature_expectations2(self, model, initial, agent):
    '''
    Compute mu = Phi*inv(I - gamma*P) where
        P(i,j)   = P(s'=i|s=j,pi) = E_{a}[ P(s'=i|s=j,a) ] = sum_{a} P(a|s) P(s'=i|s=j,a)
        Phi(:,j) = phi(s=j) = E_{a}[ phi(s=j,a) ] = sum_{a} P(a|s) phi(s,a)
        mu(:,j)  = mu(pi)(s=j) = E[ sum_t gamma^t phi(s_t,a_t) | s_0=s ]
    The agent's policy may be stochastic; each action is weighted by P(a|s).
    '''
    # Index states
    S = {}
    for (i, s) in enumerate(model.S()):
        S[s] = i

    # Initialize matrices
    ff = model.reward_function
    k = ff.dim
    n_S = len(S)
    Phi = np.zeros((k, n_S))
    P = np.zeros((n_S, n_S))

    # Build Phi: column j is the policy-weighted feature vector of state j
    for s in S:
        Ta = agent.actions(s)
        for a in Ta:
            j = S[s]
            Phi[:, j] += ff.features(s, a) * Ta[a]

    # Build P: P[i, j] = P(s'=i | s=j) under the agent's policy
    for s in S:
        Ta = agent.actions(s)
        for a in Ta:
            Ts = model.T(s, a)
            for s_p in Ts:
                i = S[s_p]
                j = S[s]
                P[i, j] += Ts[s_p] * Ta[a]

    # Calculate mu = Phi * inv(I - gamma*P)
    mu = np.dot(Phi, np.linalg.pinv(np.eye(n_S) - model.gamma * P))

    # Calculate E_{s0}[ mu(s0) ] under the initial state distribution
    result = np.zeros(k)
    for s in initial:
        j = S[s]
        result += initial[s] * mu[:, j]
    return result
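# Illustrative sketch (not part of the original API): checks the closed form
# mu = Phi * inv(I - gamma*P) used above against the truncated series
# sum_t gamma^t * Phi * P^t on a hand-built 2-state chain. All names below
# are hypothetical and exist only for this example.
def _demo_feature_expectations_closed_form():
    import numpy as np
    gamma = 0.9
    # Columns are states; rows are feature dimensions (here: indicator features)
    Phi = np.eye(2)
    # P[i, j] = P(s'=i | s=j) under some fixed policy (columns sum to 1)
    P = np.array([[0.8, 0.3],
                  [0.2, 0.7]])
    mu = Phi.dot(np.linalg.inv(np.eye(2) - gamma * P))
    # Truncated series: E[phi(s_t) | s_0=j] stacks into the columns of Phi * P^t
    approx = sum((gamma ** t) * Phi.dot(np.linalg.matrix_power(P, t))
                 for t in range(500))
    assert np.allclose(mu, approx, atol=1e-6)
    return mu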
def lstdq_exact(self, samples, model, agent, feature_f):
    '''
    Return w such that Phi*w ~~ R + gamma*P*Phi*w by solving w = inv(A)*b where
        A          = Phi'*Delta*(Phi - gamma*P*Phi)
        b          = Phi'*Delta*R
        Phi[i,:]   = phi( (s,a)=i )
        P[i,j]     = P( (s',a')=j | (s,a)=i )
        R[i]       = R( (s,a)=i )
        Delta[i,i] = empirical frequency of (s,a)=i in samples
    '''
    # Make index over state-action pairs
    k = feature_f.dim
    SA = {}
    for (i, sa) in enumerate(itertools.product(model.S(), model.A())):
        SA[sa] = i
    n_SA = len(SA)

    # Initialize matrices
    Phi = np.zeros((n_SA, k))
    Delta = np.zeros((n_SA, n_SA))
    P = np.zeros((n_SA, n_SA))
    R = np.zeros(n_SA)

    # Feature matrix
    for sa in SA:
        i = SA[sa]
        Phi[i, :] = feature_f.features(*sa)

    # Transition matrix: P[(s,a), (s',a')] = P(s'|s,a) * P(a'|s')
    for sa in SA:
        Ts = model.T(*sa)
        for s_p in Ts:
            Ta = agent.actions(s_p)
            for a_p in Ta:
                sa_p = (s_p, a_p)
                j = SA[sa_p]
                i = SA[sa]
                P[i, j] = Ta[a_p] * Ts[s_p]

    # Weighting matrix: empirical distribution of (s,a) in the samples
    delta = util.classes.NumMap()
    for (s, a, r, s_p) in samples:
        delta[(s, a)] += 1
    delta = delta.normalize()
    for sa in SA:
        i = SA[sa]
        Delta[i, i] = delta[sa]

    # Reward vector
    for sa in SA:
        i = SA[sa]
        R[i] = model.R(*sa)

    A = np.dot(np.dot(Phi.T, Delta), Phi - model.gamma * np.dot(P, Phi))
    b = np.dot(np.dot(Phi.T, Delta), R)
    return np.dot(np.linalg.pinv(A), b)
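# Illustrative sketch (not part of the original API): with tabular one-hot
# features (Phi = I) and any positive weighting Delta, the LSTD-Q solution
# w = inv(A)*b above reduces to the exact Q-values inv(I - gamma*P)*R. The
# small random matrices below are hypothetical and exist only for this check.
def _demo_lstdq_tabular():
    import numpy as np
    gamma, n_sa = 0.9, 4
    rng = np.random.default_rng(0)
    # Row-stochastic transition matrix over (s,a) pairs and a random reward
    P = rng.random((n_sa, n_sa))
    P /= P.sum(axis=1, keepdims=True)
    R = rng.random(n_sa)
    Phi = np.eye(n_sa)                       # tabular features
    Delta = np.diag(rng.random(n_sa) + 0.1)  # any positive diagonal weighting
    A = Phi.T.dot(Delta).dot(Phi - gamma * P.dot(Phi))
    b = Phi.T.dot(Delta).dot(R)
    w = np.linalg.solve(A, b)
    Q = np.linalg.solve(np.eye(n_sa) - gamma * P, R)
    assert np.allclose(w, Q)
    return w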
def iter(self, model, agent, V):
    '''One sweep of policy evaluation: back up V once through the agent's policy.'''
    VV = util.classes.NumMap()
    for s in model.S():
        pi = agent.actions(s)
        vv = 0
        for (a, t_pi) in pi.items():
            # One-step Bellman backup for (s, a)
            v = model.R(s, a)
            T = model.T(s, a)
            v += model.gamma * sum([t * V[s_prime] for (s_prime, t) in T.items()])
            vv += t_pi * v
        VV[s] = vv
    return VV
def feature_expectations(self, model, initial, agent):
    '''
    Calculate mu(s,a) = E[ sum_t gamma^t phi(s_t,a_t) | s_0=s, a_0=a ] via
    repeated application of
        mu(s,a) <-- phi(s,a) + gamma*sum_s' P(s'|s,a) sum_a' P(a'|s') mu(s',a')
    Then return sum_s0 sum_a P(s0) P(a|s0) mu(s0,a).
    Assumes sup_{(s,a)} ||phi(s,a)||_inf <= 1.0.
    '''
    ff = model.reward_function
    i = 0

    # Initialize feature expectations
    mu = {}
    for s in model.S():
        for a in model.A(s):
            mu[(s, a)] = numpy.zeros(ff.dim)

    # Iterate the backup until the next neglected term, gamma**i, drops below 1%
    # (assuming ||phi(s,a)||_inf <= 1 for all (s,a)):
    #   mu(s,a) = phi(s,a) + gamma*sum_{s'} P(s'|s,a) * sum_{a'} P(a'|s') mu(s',a')
    while model.gamma ** i >= 0.01:
        i += 1
        mu2 = {}
        for s in model.S():
            for a in model.A(s):
                v = ff.features(s, a)
                for (s_prime, t_s) in model.T(s, a).items():
                    for (a_prime, t_a) in agent.actions(s_prime).items():
                        v += model.gamma * t_s * t_a * mu[(s_prime, a_prime)]
                mu2[(s, a)] = v
        mu = mu2

    # result = sum_{s} sum_{a} P(s) * P(a|s) * mu(s,a)
    result = numpy.zeros(ff.dim)
    for s in initial:
        pi = agent.actions(s)
        for a in pi:
            result += initial[s] * pi[a] * mu[(s, a)]
    return result
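# Illustrative note on the stopping rule above (a sketch, not part of the
# original API): the loop exits once gamma**i < 0.01, which bounds the first
# neglected term; when ||phi(s,a)||_inf <= 1 the whole neglected tail is
# bounded by gamma**i / (1 - gamma).
def _demo_truncation_bound(gamma=0.9):
    import math
    i = math.ceil(math.log(0.01) / math.log(gamma))  # roughly the first i with gamma**i < 0.01
    tail_bound = gamma ** i / (1.0 - gamma)          # bound on sum_{t >= i} gamma**t
    return i, tail_bound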
def iter(self, model, agent, V):
    VV = util.classes.NumMap()
    delta = 0
    for s in model.S():
        pi = agent.actions(s)
        vv = 0
        for (a, t_pi) in pi.items():
            v = model.R(s, a)
            T = model.T(s, a)
            v += model.gamma * sum([t * V[s_prime] for (s_prime, t) in T.items()])
            vv += t_pi * v
        VV[s] = vv
        if s in V:
            delta = max(delta, abs(V[s] - VV[s]))
        else:
            delta = max(delta, abs(VV[s]))
    return (VV, delta)
def lstd_exact(self, samples, model, feature_f, gamma, agent):
    '''
    Use least squares to estimate V^{pi} ~~ Phi*w with w = inv(A)*b where
        A          = Phi'*Delta*(Phi - gamma*P*Phi)
        b          = Phi'*Delta*R
        Phi[i,:]   = phi(s=i)
        Delta[i,i] = P(s=i) according to the empirical distribution of samples
        P[i,j]     = P(s'=j|s=i) = sum_a P(a|s) P(s'|s,a)
        R[i]       = E[R(s=i)]   = sum_a P(a|s) R(s,a)
    '''
    k = feature_f.dim

    # Make index over states
    S = {}
    for (i, s) in enumerate(model.S()):
        S[s] = i
    n_S = len(S)

    # Initialize matrices
    Phi = np.zeros((n_S, k))
    Delta = np.zeros((n_S, n_S))
    P = np.zeros((n_S, n_S))
    R = np.zeros(n_S)

    # Phi
    for s in S:
        i = S[s]
        Phi[i, :] = feature_f.features(s)

    # P: marginalize the transition model over the agent's policy
    for s in S:
        Ta = agent.actions(s)
        for a in model.A(s):
            Ts = model.T(s, a)
            for s_p in Ts:
                i = S[s]
                j = S[s_p]
                P[i, j] += Ta[a] * Ts[s_p]

    # R: expected immediate reward under the policy
    for s in S:
        Ta = agent.actions(s)
        for a in Ta:
            i = S[s]
            R[i] += model.R(s, a) * Ta[a]

    # Delta: empirical state distribution of the samples
    delta = util.classes.NumMap()
    for (s, a, r, s_p) in samples:
        delta[s] += 1
    delta = delta.normalize()
    for s in delta:
        i = S[s]
        Delta[i, i] = delta[s]

    # Actual computation
    A = np.dot(np.dot(Phi.T, Delta), Phi - gamma * np.dot(P, Phi))
    b = np.dot(Phi.T, np.dot(Delta, R))
    w = np.dot(np.linalg.pinv(A), b)

    V = util.classes.NumMap()
    for s in model.S():
        V[s] = np.dot(feature_f.features(s), w)
    return V
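# Illustrative sketch (not part of the original API): the marginalization in
# the docstring above, P[i,j] = sum_a P(a|s=i) P(s'=j|s=i,a) and
# R[i] = sum_a P(a|s=i) R(s=i,a), written with dense arrays and einsum.
# T3, pi and Rsa below are hypothetical stand-ins for model.T, agent.actions
# and model.R.
def _demo_policy_marginalization():
    import numpy as np
    rng = np.random.default_rng(1)
    n_s, n_a = 3, 2
    # T3[s, a, s'] = P(s'|s,a); distributions over s' sum to 1
    T3 = rng.random((n_s, n_a, n_s))
    T3 /= T3.sum(axis=2, keepdims=True)
    # pi[s, a] = P(a|s); distributions over a sum to 1
    pi = rng.random((n_s, n_a))
    pi /= pi.sum(axis=1, keepdims=True)
    Rsa = rng.random((n_s, n_a))
    P = np.einsum('sa,sap->sp', pi, T3)   # P[s, s'] = sum_a pi(a|s) T(s,a,s')
    R = np.einsum('sa,sa->s', pi, Rsa)    # R[s]     = sum_a pi(a|s) R(s,a)
    assert np.allclose(P.sum(axis=1), 1.0)
    return P, R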
def evaluate_policy(self, model, agent):
    '''
    Use linear algebra to solve Q = R + gamma*P*PI*Q where
        R  is (m*n x 1)
        P  is (m*n x n)
        PI is (n x m*n); row i holds P(a|s=i) in the columns of state i's
            (s,a) pairs and zeros elsewhere
        Q  is (m*n x 1)
        m  = number of actions
        n  = number of states
    '''
    # States, actions, and state-action pairs
    S = list(model.S())
    A = list(model.A())
    SA = []
    for s in S:
        for a in A:
            SA.append((s, a))
    S_dict = {}
    for (i, s) in enumerate(S):
        S_dict[s] = i
    A_dict = {}
    for (j, a) in enumerate(A):
        A_dict[a] = j
    SA_dict = {}
    for (i, s) in enumerate(S):
        for (j, a) in enumerate(A):
            SA_dict[(s, a)] = i * len(A) + j

    gamma = model.gamma
    (n, m) = (len(S), len(A))
    R = np.zeros(m * n)
    P = np.zeros([m * n, n])
    PI = np.zeros([n, m * n])

    # Fill R
    for ((s, a), i) in SA_dict.items():
        R[i] = model.R(s, a)

    # Fill P
    for ((s, a), i) in SA_dict.items():
        T = model.T(s, a)
        for (s2, p) in T.items():
            j = S_dict[s2]
            P[i, j] = p

    # Fill PI
    pis = {}
    for s in S:
        pi = agent.actions(s)
        for a in A:
            pis[(s, a)] = pi[a]
    for (i, s) in enumerate(S):
        for (j, (s_p, a)) in enumerate(SA):
            if s_p != s:
                continue
            PI[i, j] = pis[(s, a)]

    # Solve Q = R + gamma*P*PI*Q
    Q = numpy.linalg.solve(np.eye(n * m) - gamma * np.dot(P, PI), R)

    # Build V(s) = max_a Q(s,a)
    V = util.classes.NumMap()
    for (i, s) in enumerate(S):
        acts = util.classes.NumMap()
        for a in A:
            acts[a] = Q[SA_dict[(s, a)]]
        V[s] = acts.max()
    return V
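# Illustrative sketch (not part of the original API): builds the block system
# Q = R + gamma*P*PI*Q from the docstring above for a hypothetical 2-state,
# 2-action MDP and checks the direct solve against fixed-point iteration
# under the same policy. All arrays below exist only for this example.
def _demo_evaluate_policy_linear():
    import numpy as np
    gamma, n, m = 0.9, 2, 2
    rng = np.random.default_rng(2)
    # Dense MDP: T3[s, a, s'] = P(s'|s,a), Rsa[s, a], pi[s, a] = P(a|s)
    T3 = rng.random((n, m, n)); T3 /= T3.sum(axis=2, keepdims=True)
    Rsa = rng.random((n, m))
    pi = rng.random((n, m)); pi /= pi.sum(axis=1, keepdims=True)
    # Flatten to the shapes used above: index (s, a) -> s*m + a
    R = Rsa.reshape(n * m)
    P = T3.reshape(n * m, n)      # (m*n x n)
    PI = np.zeros((n, n * m))     # (n x m*n)
    for s in range(n):
        PI[s, s * m:(s + 1) * m] = pi[s]
    Q = np.linalg.solve(np.eye(n * m) - gamma * P.dot(PI), R)
    # Reference: iterate Q <- R + gamma*P*PI*Q to the same fixed point
    Q_it = np.zeros(n * m)
    for _ in range(2000):
        Q_it = R + gamma * P.dot(PI).dot(Q_it)
    assert np.allclose(Q, Q_it, atol=1e-8)
    return Q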