Example #1
    def convert(self, observations=True):
        # Requires: from scipy import zeros (NumPy's zeros works equally well).
        if self.verbose and observations:
            print('Number of features:', 5 * len(self.env._obstypes))
        # Enumerate all reachable states by flood-filling from the initial state.
        initSet = [self.env._initstate]
        self.states = sorted(flood(self.tryMoves, None, initSet))
        dim = len(self.states)
        if self.verbose:
            print('Actual states:', dim)
            print('Non-zero rewards:', self.rewards)
            print('Initial state:', initSet[0])
        # One dim x dim transition matrix per action, plus a reward vector R(s').
        Ts = [zeros((dim, dim)) for _ in self.env._actionset]
        R = zeros(dim)
        # Index states and actions so they can address the matrices.
        statedic = {}
        actiondic = {}
        for si, pos in enumerate(self.states):
            statedic[pos] = si
        for ai, a in enumerate(self.env._actionset):
            actiondic[a] = ai
        for pos, val in self.rewards.items():
            R[statedic[pos]] += val
        # Accumulate empirical transition counts, averaged over avgOver samples.
        for pos, a, dest in self.sas_tuples:
            ai = actiondic[a]
            si = statedic[pos]
            di = statedic[dest]
            Ts[ai][si, di] += 1. / self.avgOver
        if self.verbose:
            print('Built Ts.')
        # Normalize each row into a probability distribution; states that were
        # never visited become absorbing (self-transition with probability 1).
        for T in Ts:
            for ti, row in enumerate(T):
                if sum(row) > 0:
                    row /= sum(row)
                else:
                    row[ti] = 1
        if self.verbose:
            print('Normalized Ts.')
        if observations:
            # One observation for the current position and each of the 4 neighbors.
            fMap = zeros((len(self.env._obstypes) * 5, dim))
            for si, state in enumerate(self.states):
                fMap[:, si] = self.env.getSensors(state)
            if self.verbose:
                print('Built features.')
            return Ts, R, fMap
        return Ts, R
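
The returned Ts (one row-stochastic dim x dim matrix per action) and the destination-indexed reward vector R plug directly into a standard tabular MDP solver. Below is a minimal value-iteration sketch over that output; it assumes NumPy, and the function name, discount gamma, and tolerance are illustrative choices, not part of the original class:

import numpy as np

def value_iteration(Ts, R, gamma=0.9, tol=1e-8):
    # Bellman backup with rewards indexed by the destination state,
    # matching how R is built above:
    #   V(s) = max_a sum_s' T_a(s, s') * (R(s') + gamma * V(s'))
    V = np.zeros(len(R))
    while True:
        Q = np.array([T.dot(R + gamma * V) for T in Ts])  # shape (A, dim)
        V_new = Q.max(axis=0)
        if np.abs(V_new - V).max() < tol:
            return V_new, Q.argmax(axis=0)  # state values and a greedy policy
        V = V_new

For example, Ts, R, fMap = converter.convert() followed by V, policy = value_iteration(Ts, R) recovers a greedy policy over the flooded state set (converter here stands in for an instance of the surrounding class).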
Example #2
    def convert_task_to_mdp(self):
        # Requires: import numpy as np
        # Find all reachable states by flood-filling from the initial game
        # state, logging transitions and rewards along the way.
        self.states = sorted(
            flood(self.get_neighbors, None, [self.env.init_game_state]))
        state_dict = {
            state: state_i
            for state_i, state in enumerate(self.states)
        }

        # Reward function R(s'); assumes self.rewards has an entry for every
        # state (e.g. a defaultdict).
        R = np.fromiter((self.rewards[state] for state in self.states),
                        dtype=np.double)

        # Transition tensor of shape A x S x S; transitions here are
        # deterministic, so each recorded entry gets probability 1.
        T = np.zeros((self.env.numActions, len(self.states), len(self.states)))

        for state, action_i, next_state in self.transitions:
            # Careful: states are actual states, but action_i is an index.
            T[action_i, state_dict[state], state_dict[next_state]] = 1

        return T, R
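
As in the first example, the A x S x S tensor T and the reward vector R feed straight into a tabular solver, with a batched matmul replacing the per-action loop. A short sketch under the same assumptions (NumPy; solve_mdp, gamma, and the iteration count are illustrative, not part of the original class):

import numpy as np

def solve_mdp(T, R, gamma=0.9, iters=500):
    # Q[a, s] = sum_s' T[a, s, s'] * (R(s') + gamma * V(s'))
    V = np.zeros(R.shape[0])
    for _ in range(iters):
        Q = T @ (R + gamma * V)  # batched matrix-vector product, shape (A, S)
        V = Q.max(axis=0)
    return V, Q.argmax(axis=0)

Note that rows of T for state-action pairs with no recorded transition stay all-zero here, whereas the first example makes such states absorbing; under this backup those states simply keep value 0.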