Example #1
    def feature_expectations2(self, model, initial, agent):
        '''
        compute 
            mu = Phi*inv(I - gamma*P)
        where 
            P(i,j)     = P(s'=i|s=j,pi) = E_{a}[ P(s'=i|s=j,a) ] = sum_{a} P(a|s) P(s'=i|s=j,a)
            Phi(:,j)   = phi(s=j) = E_{a}[ phi(s=j,a) ] = sum_{a} P(a|s) phi(s,a)
            mu(:,j)    = mu(pi)(s=j) = E[sum_t gamma^t phi(s_t,a_t) | s_0=s]
        The policy P(a|s) is read from agent.actions(s) and assumed fixed; it may be stochastic.
        '''
        # Index states
        S = {}
        for (i, s) in enumerate(model.S()):
            S[s] = i

        #Initialize matrices
        ff = model.reward_function
        k = ff.dim
        n_S = len(S)
        Phi = np.zeros((k, n_S))
        P = np.zeros((n_S, n_S))

        # build Phi
        for s in S:
            Ta = agent.actions(s)
            for a in Ta:
                j = S[s]
                Phi[:, j] += ff.features(s, a) * Ta[a]

        # Build P
        for s in S:
            Ta = agent.actions(s)
            for a in Ta:
                Ts = model.T(s, a)
                for s_p in Ts:
                    i = S[s_p]
                    j = S[s]
                    P[i, j] += Ts[s_p] * Ta[a]
        # Calculate mu
        mu = np.dot(Phi, np.linalg.pinv(np.eye(n_S) - model.gamma * P))

        # Calculate E_{s0}[ phi(s) ]
        result = np.zeros(k)
        for s in initial:
            j = S[s]
            result += initial[s] * mu[:, j]
        return result
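The closed form in the docstring, mu = Phi*inv(I - gamma*P), can be sanity-checked with plain NumPy. The sketch below is not part of the original module; it assumes a made-up 2-state chain and compares the closed form against the iterative backup mu <- Phi + gamma*mu*P.

    import numpy as np

    # Toy check of mu = Phi @ inv(I - gamma*P); columns of P are source states,
    # rows are successors, matching P(i, j) = P(s'=i | s=j, pi) above.
    gamma = 0.9
    P = np.array([[0.8, 0.3],
                  [0.2, 0.7]])      # column-stochastic transition matrix
    Phi = np.array([[1.0, 0.0],
                    [0.0, 1.0]])    # phi(s=j) stored as column j

    mu_closed = Phi @ np.linalg.inv(np.eye(2) - gamma * P)

    # Same quantity via the Bellman backup mu <- Phi + gamma * mu @ P.
    mu_iter = np.zeros_like(Phi)
    for _ in range(2000):
        mu_iter = Phi + gamma * mu_iter @ P

    print(np.allclose(mu_closed, mu_iter))   # True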
Example #3
    def lstdq_exact(self, samples, model, agent, feature_f):
        '''
        Return w such that
            Phi*w = R + gamma*P*Phi*w
        by solving
            w = inv(A)*b
        where
            A = Phi'*Delta*(Phi-gamma*P*Phi)
            b = Phi'*Delta*R
            Phi[i,:] = Phi( (s,a)=i )
            P[i,j] = P( (s',a')=j | (s,a) = i )
            R[i] = R( (s,a)=i )
            Delta[i,i] = P( (s,a)=i ) in samples
        '''
        # Make index
        k = feature_f.dim
        SA = {}
        for (i, sa) in enumerate(itertools.product(model.S(), model.A())):
            SA[sa] = i
        n_SA = len(SA)

        # initialize matrices
        Phi = np.zeros((n_SA, k))
        Delta = np.zeros((n_SA, n_SA))
        P = np.zeros((n_SA, n_SA))
        R = np.zeros(n_SA)

        # Feature Matrix
        for sa in SA:
            i = SA[sa]
            Phi[i, :] = feature_f.features(*sa)
        # Transition Matrix
        for sa in SA:
            Ts = model.T(*sa)
            for s_p in Ts:
                Ta = agent.actions(s_p)
                for a_p in Ta:
                    sa_p = (s_p, a_p)
                    j = SA[sa_p]
                    i = SA[sa]
                    P[i, j] = Ta[a_p] * Ts[s_p]
        # Weighting Matrix
        delta = util.classes.NumMap()
        for (s, a, r, s_p) in samples:
            delta[(s, a)] += 1
        delta = delta.normalize()
        for sa in SA:
            i = SA[sa]
            Delta[i, i] = delta[sa]
        # Reward Vector
        for sa in SA:
            i = SA[sa]
            R[i] = model.R(*sa)

        A = np.dot(np.dot(Phi.T, Delta), Phi - model.gamma * np.dot(P, Phi))  # P @ Phi, per A = Phi'*Delta*(Phi - gamma*P*Phi)
        b = np.dot(np.dot(Phi.T, Delta), R)
        return np.dot(np.linalg.pinv(A), b)
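A minimal standalone check of the normal equations above (toy matrices assumed; the original model/agent/feature_f objects are not used): with one-hot (s,a) features, w = pinv(A)*b satisfies the fixed point Phi*w = R + gamma*P*Phi*w exactly.

    import numpy as np

    rng = np.random.default_rng(0)
    n_sa, gamma = 4, 0.9

    P = rng.random((n_sa, n_sa))
    P /= P.sum(axis=1, keepdims=True)              # P[i, j] = P((s',a')=j | (s,a)=i)
    R = rng.random(n_sa)                           # R[i] = R((s,a)=i)
    Phi = np.eye(n_sa)                             # one-hot feature per (s,a) pair
    Delta = np.diag(rng.dirichlet(np.ones(n_sa)))  # empirical (s,a) weights

    A = Phi.T @ Delta @ (Phi - gamma * P @ Phi)
    b = Phi.T @ Delta @ R
    w = np.linalg.pinv(A) @ b

    print(np.allclose(Phi @ w, R + gamma * P @ Phi @ w))   # True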
Example #6
 def iter(self, model, agent, V):
     VV = util.classes.NumMap()
     for s in model.S():
         pi = agent.actions(s)
         vv = 0
         for (a, t_pi) in pi.items():
             v = model.R(s, a)
             T = model.T(s, a)
             v += model.gamma * sum(
                 [t * V[s_prime] for (s_prime, t) in T.items()])
             vv += t_pi * v
         VV[s] = vv
     return VV
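iter performs a single sweep of iterative policy evaluation; repeating it until the values stop changing converges to V^pi. The dict-based sketch below is self-contained (it assumes a toy MDP and does not need util.classes.NumMap or the model/agent API).

    # Toy MDP: s0 pays reward 1 and moves to s0/s1 uniformly; s1 absorbs with reward 0.
    gamma = 0.9
    S = ['s0', 's1']
    R = {('s0', 'a'): 1.0, ('s1', 'a'): 0.0}                # R(s, a)
    T = {('s0', 'a'): {'s0': 0.5, 's1': 0.5},               # P(s' | s, a)
         ('s1', 'a'): {'s1': 1.0}}
    pi = {'s0': {'a': 1.0}, 's1': {'a': 1.0}}               # P(a | s)

    V = {s: 0.0 for s in S}
    for _ in range(500):                                    # repeated sweeps of iter()
        VV = {}
        for s in S:
            vv = 0.0
            for a, p_a in pi[s].items():
                v = R[(s, a)]
                v += gamma * sum(p * V[sp] for sp, p in T[(s, a)].items())
                vv += p_a * v
            VV[s] = vv
        V = VV
    print(V)   # V['s1'] -> 0.0, V['s0'] -> 1/(1 - 0.5*gamma) ~= 1.82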
Example #7
    def feature_expectations(self, model, initial, agent):
        '''
        Calculate mu(s,a) = E[sum_t gamma^t phi(s_t,a_t) | s_0=s, a_0=a] via repeated applications of
            mu(s,a) <--- phi(s,a) + gamma*sum_{s'} P(s'|s,a) sum_{a'} P(a'|s') mu(s',a')
        Then returns sum_s0 sum_a P(s0) P(a|s0) mu(s0,a).
         
        Assumes sup_{ (s,a) } ||phi(s,a)||_{\infty} <= 1.0
        '''
        ff = model.reward_function
        i = 0
        # Initialize feature expectations
        mu = {}
        for s in model.S():
            for a in model.A(s):
                mu[(s, a)] = numpy.zeros(ff.dim)

        # Until error is less than 1% (assuming ||phi(s,a)||_{inf} <= 1 for all (s,a) )
        # mu(s,a) = phi(s,a) + gamma*sum_{s'} P(s'|s,a) *sum_{a'} P(a'|s') mu(s',a')
        while model.gamma**i >= 0.01:
            i += 1
            mu2 = {}
            for s in model.S():
                for a in model.A(s):
                    v = ff.features(s, a)
                    for (s_prime, t_s) in model.T(s, a).items():
                        for (a_prime, t_a) in agent.actions(s_prime).items():
                            v += model.gamma * t_s * t_a * mu[(s_prime,
                                                               a_prime)]
                    mu2[(s, a)] = v
            mu = mu2
        result = numpy.zeros(ff.dim)

        # result = sum_{s} sum_{a} P(s)*P(a|s)*mu(s,a)
        for s in initial:
            pi = agent.actions(s)
            for a in pi:
                result += initial[s] * pi[a] * mu[(s, a)]
        return result
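The stopping rule above follows from the assumption ||phi(s,a)||_inf <= 1: truncating after i backups leaves a tail of at most gamma^i/(1 - gamma) per feature, so the loop runs until gamma^i < 0.01. A quick numeric check for gamma = 0.9 (illustrative only):

    import math

    gamma = 0.9
    i = math.ceil(math.log(0.01) / math.log(gamma))
    print(i, gamma**i, gamma**i / (1 - gamma))   # 44 backups; tail bound ~= 0.097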
    def iter(self, model, agent, V):
        VV = util.classes.NumMap()
        delta = 0
        for s in model.S():
            pi = agent.actions(s)
            vv = 0
            for (a, t_pi) in pi.items():
                v = model.R(s, a)
                T = model.T(s, a)
                v += model.gamma * sum(
                    [t * V[s_prime] for (s_prime, t) in T.items()])
                vv += t_pi * v
            VV[s] = vv
            if (s in V):
                delta = max(delta, abs(V[s] - VV[s]))
            else:
                delta = max(delta, abs(VV[s]))

        return (VV, delta)
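This variant additionally returns delta, the largest per-state change of the sweep, which is what a caller tests against a tolerance to decide when to stop. A standalone sketch of that stopping rule on an assumed 2-state MDP:

    # Sweep until the sup-norm change between successive value functions is below eps.
    gamma, eps = 0.9, 1e-8
    R = {'s0': 1.0, 's1': 0.0}                              # expected reward under pi
    P = {'s0': {'s0': 0.5, 's1': 0.5}, 's1': {'s1': 1.0}}   # P(s' | s) under pi

    V, delta, sweeps = {s: 0.0 for s in R}, float('inf'), 0
    while delta >= eps:
        VV = {s: R[s] + gamma * sum(p * V[sp] for sp, p in P[s].items()) for s in R}
        delta = max(abs(VV[s] - V[s]) for s in R)
        V, sweeps = VV, sweeps + 1
    print(sweeps, V)   # converges in a few dozen sweeps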
Example #10
    def lstd_exact(self, samples, model, feature_f, gamma, agent):
        '''
        Use least squares to estimate V^{\pi} ~~ Phi*w with
            w = inv(A)*b
        where
            A = Phi'*Delta*(Phi-gamma*P*Phi)
            b = Phi'*Delta*R
            Phi[i,:] = phi(s=i)
            Delta[i,i] = P(s=i) according to stationary distr of samples
            P[i,j] = P(s'=j|s=i) = sum_a P(a|s) P(s'|s,a)
            R[i] = E[R(s=i)] = sum_a P(a|s=i) R(s=i,a)
        '''
        k = feature_f.dim

        # Make index
        S = {}
        for (i, s) in enumerate(model.S()):
            S[s] = i
        n_S = len(S)

        # Initialize Matrices
        Phi = np.zeros((n_S, k))
        Delta = np.zeros((n_S, n_S))
        P = np.zeros((n_S, n_S))
        R = np.zeros(n_S)

        # Phi
        for s in S:
            i = S[s]
            Phi[i, :] = feature_f.features(s)
        # P
        for s in S:
            Ta = agent.actions(s)
            for a in model.A(s):
                Ts = model.T(s, a)
                for s_p in Ts:
                    i = S[s]
                    j = S[s_p]
                    P[i, j] += Ta[a] * Ts[s_p]
        # R
        for s in S:
            Ta = agent.actions(s)
            for a in Ta:
                i = S[s]
                R[i] += model.R(s, a) * Ta[a]
        # Delta
        delta = util.classes.NumMap()
        for (s, a, r, s_p) in samples:
            delta[s] += 1
        delta = delta.normalize()
        for s in delta:
            i = S[s]
            Delta[i, i] = delta[s]

        # Actual computation
        A = np.dot(np.dot(Phi.T, Delta), Phi - gamma * np.dot(P, Phi))
        b = np.dot(Phi.T, np.dot(Delta, R))
        w = np.dot(np.linalg.pinv(A), b)

        V = util.classes.NumMap()
        for s in model.S():
            V[s] = np.dot(feature_f.features(s), w)
        return V
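The Delta construction above relies on util.classes.NumMap and its normalize(); the same empirical state distribution can be built with the standard library. A sketch with made-up sample tuples:

    from collections import Counter
    import numpy as np

    samples = [('s0', 'a', 1.0, 's1'), ('s1', 'a', 0.0, 's1'), ('s0', 'a', 1.0, 's0')]
    S = {'s0': 0, 's1': 1}

    counts = Counter(s for (s, a, r, s_p) in samples)       # visit counts per state
    total = sum(counts.values())
    Delta = np.zeros((len(S), len(S)))
    for s, c in counts.items():
        Delta[S[s], S[s]] = c / total                       # normalized, on the diagonal
    print(np.diag(Delta))   # [0.666..., 0.333...]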
Example #11
    def evaluate_policy(self, model, agent):
        '''
        Use linear algebra to solve
            Q = R + gamma*P*PI*Q
        where R  is (m*n x 1)
              P  is (m*n x n)
              PI is (n x m*n); row i holds P(a|s=i) in the m columns belonging to state i, zero elsewhere
              Q  is (m*n x 1)
        m = number of actions
        n = number of states
        '''
        # State + Actions
        S = list(model.S())
        A = list(model.A())
        SA = []
        for s in S:
            for a in A:
                SA.append((s, a))

        S_dict = {}
        for (i, s) in enumerate(S):
            S_dict[s] = i

        A_dict = {}
        for (j, a) in enumerate(A):
            A_dict[a] = j

        SA_dict = {}
        for (i, s) in enumerate(S):
            for (j, a) in enumerate(A):
                SA_dict[(s, a)] = i * len(A) + j

        gamma = model.gamma
        (n, m) = (len(S), len(A))
        R = np.zeros(m * n)
        P = np.zeros([m * n, n])
        PI = np.zeros([n, m * n])

        # Fill R
        for ((s, a), i) in SA_dict.items():
            R[i] = model.R(s, a)

        # Fill P
        for ((s, a), i) in SA_dict.items():
            T = model.T(s, a)
            for (s2, p) in T.items():
                j = S_dict[s2]
                P[i, j] = p

        # Fill PI
        pis = {}
        for s in S:
            pi = agent.actions(s)
            for a in A:
                pis[(s, a)] = pi[a]
        for (i, s) in enumerate(S):
            for (j, (s_p, a)) in enumerate(SA):
                if s_p != s:
                    continue
                PI[i, j] = pis[(s, a)]

        # Solve Q = R + gamma*P*PI*Q
        Q = np.linalg.solve(np.eye(n * m) - gamma * np.dot(P, PI), R)

        # Build V = max_{A} Q(s,a)
        V = util.classes.NumMap()
        for (i, s) in enumerate(S):
            acts = util.classes.NumMap()
            for a in A:
                acts[a] = Q[SA_dict[(s, a)]]
            V[s] = acts.max()
        return V
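The linear system Q = R + gamma*P*PI*Q can be exercised on a tiny hand-built instance to confirm the block layout of PI (row i carries the policy of state i over its m columns). The sketch below uses made-up numbers and a uniform policy; it is not tied to the original model/agent classes.

    import numpy as np

    n, m, gamma = 2, 2, 0.9                   # n states, m actions
    rng = np.random.default_rng(2)

    R = rng.random(m * n)                     # R[(s, a)] flattened as i*m + j
    P = rng.random((m * n, n))
    P /= P.sum(axis=1, keepdims=True)         # P[(s, a), s']
    PI = np.zeros((n, m * n))
    for i in range(n):                        # uniform policy: P(a | s=i) = 1/m
        PI[i, i * m:(i + 1) * m] = 1.0 / m

    Q = np.linalg.solve(np.eye(n * m) - gamma * P @ PI, R)
    print(np.allclose(Q, R + gamma * P @ PI @ Q))   # True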