def __init__(self, m, n):
        '''
        Create an m x n GridWorld, initialise the Q-table and the start state.
        '''
        self.env = GridWorld(m, n)
        self.q = self.init_q()   # Q-values for every (state, action) pair
        self.current = [1, 0]    # current position in the grid
        self.gamma = 0.9         # discount factor
Example #2
    def calculate_step(self, x, y, policy):
        gamma = 1
        state = np.array([x, y])
        # The intended move succeeds with probability 0.8; with probability 0.1
        # each, the agent slips to one of the two perpendicular directions.
        forward_step = self.world.step(state, policy[x, y])
        if policy[x, y] == 'u' or policy[x, y] == 'd':
            side_step_one = self.world.step(state, 'l')
            side_step_two = self.world.step(state, 'r')
        elif policy[x, y] == 'l' or policy[x, y] == 'r':
            side_step_one = self.world.step(state, 'u')
            side_step_two = self.world.step(state, 'd')

        # Living cost of -0.04 plus the discounted expected value of the successors.
        reward = -0.04 + gamma * (0.8 * self.world[forward_step] +
                                  0.1 * self.world[side_step_one] +
                                  0.1 * self.world[side_step_two])
        return reward
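
The backup in calculate_step is the standard stochastic gridworld model: the intended move succeeds with probability 0.8, slips sideways with probability 0.1 each, and every step costs -0.04. A minimal, self-contained sketch of the same arithmetic follows; the expected_value helper and the example numbers are made up for illustration and are not part of the original code.

    def expected_value(v_forward, v_left, v_right, gamma=1.0, step_cost=-0.04):
        """Expected value of the intended action under the 0.8/0.1/0.1 slip model."""
        return step_cost + gamma * (0.8 * v_forward + 0.1 * v_left + 0.1 * v_right)

    # Moving towards a cell worth 1.0, flanked by cells worth 0.0 and -1.0:
    # -0.04 + (0.8 * 1.0 + 0.1 * 0.0 + 0.1 * (-1.0)) = 0.66
    print(expected_value(1.0, 0.0, -1.0))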
Example #3
class mdp:
    def __init__(self, path):
        self.world = GridWorld(path)
        self.directions = self.world.get_directions()

    def create_policy(self):
        # Start from a uniformly random policy: draw a random action index for
        # every cell, then translate the indices to direction characters.
        directionsTranslator = {0: 'u', 1: 'd', 2: 'l', 3: 'r'}
        policy = np.random.randint(0, len(self.directions), self.world.get_size())
        outPol = np.chararray(self.world.get_size())
        for i in range(policy.max() + 1):
            outPol[policy == i] = directionsTranslator[i]
        # Cells flagged by free_map() get the marker 'X' instead of an action.
        outPol[self.world.free_map()] = 'X'
        return outPol.decode('UTF-8')

    def calculate_step(self, x, y, policy):
        gamma = 1
        state = np.array([x, y])
        # The intended move succeeds with probability 0.8; with probability 0.1
        # each, the agent slips to one of the two perpendicular directions.
        forward_step = self.world.step(state, policy[x, y])
        if policy[x, y] == 'u' or policy[x, y] == 'd':
            side_step_one = self.world.step(state, 'l')
            side_step_two = self.world.step(state, 'r')
        elif policy[x, y] == 'l' or policy[x, y] == 'r':
            side_step_one = self.world.step(state, 'u')
            side_step_two = self.world.step(state, 'd')

        # Living cost of -0.04 plus the discounted expected value of the successors.
        reward = -0.04 + gamma * (0.8 * self.world[forward_step] +
                                  0.1 * self.world[side_step_one] +
                                  0.1 * self.world[side_step_two])
        return reward
    
    def evaluate_policy(self, policy, max_steps):
        # Sweep over every cell and back up its value under the given policy,
        # repeating the sweep max_steps times.
        evaluated_policy = np.zeros(shape=policy.shape)
        M = self.world.get_size()
        for n in range(max_steps):
            for x in range(M[0]):
                for y in range(M[1]):
                    evaluated_policy[x, y] = self.calculate_step(x, y, policy)
        return evaluated_policy
    
    def iterate_policy(self):
        # TODO: alternate policy evaluation and greedy policy improvement.
        pass

    def print_solutions(self):
        # TODO: print the evaluated values and the resulting policy.
        pass
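
A usage sketch for the class above, assuming a GridWorld map file is available; the 'world.txt' name is only a placeholder, and note that calculate_step as written does not handle cells marked 'X':

    solver = mdp('world.txt')                        # hypothetical map file
    policy = solver.create_policy()                  # random direction per cell
    values = solver.evaluate_policy(policy, max_steps=10)
    print(values)
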
class QLearning(object):
    '''
    Tabular Q-learning on an m x n GridWorld.
    '''
    def __init__(self, m, n):
        '''
        Create the environment, initialise the Q-table, and set the start state.
        '''
        self.env = GridWorld(m, n)
        self.q = self.init_q()   # Q-values for every (state, action) pair
        self.current = [1, 0]    # current position in the grid
        self.gamma = 0.9         # discount factor

    def init_q(self):
        '''
        Build a nested dict q[row][col][action] covering every cell and action.

        :return: the zero-initialised Q-table
        '''
        q = dict()
        for i in range(self.env.grid.shape[0]):
            q[i] = {}
            for j in range(self.env.grid.shape[1]):
                q[i][j] = {}
                for a in self.env.actions:
                    q[i][j][a] = 0
        return q

    def learn(self):
        '''
        Run 500 steps of Q-learning with uniformly random action selection.

        :return: None; the Q-table is updated in place
        '''
        for i in range(500):
            # Pick a random action, take it, and observe the successor and reward.
            a = np.random.choice(len(self.env.actions), 1)[0]
            next_state, reward = self.env.step(self.current, a)
            # One-step update: overwrite Q(s, a) with the bootstrapped target.
            self.q[self.current[0]][self.current[1]][a] = reward + self.gamma * max(
                self.q[next_state[0]][next_state[1]].values())
            self.current = next_state
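
Once learn() has run, a greedy policy can be read off the Q-table. The sketch below assumes the same constructor as the class above (here a 4x4 grid); the extract_policy helper is not part of the original code.

    def extract_policy(agent):
        # For each cell, pick the action with the highest learned Q-value.
        policy = {}
        for i, row in agent.q.items():
            for j, actions in row.items():
                policy[(i, j)] = max(actions, key=actions.get)
        return policy

    agent = QLearning(4, 4)
    agent.learn()
    print(extract_policy(agent))
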
Example #5
 def create_policy(self):
     # TODO: build a real policy; for now this stub only prints the grid size.
     size = self.world.getSize()
     print(size)
Example #6
 def __init__(self, path):
     self.world = GridWorld(path)
Example #7
 def __init__(self, path):
     self.world = GridWorld(path)
     self.directions = self.world.get_directions()