Example #1
    def agent_start(self, state):
        """
        Arguments: state - numpy array
        Returns: action - integer
        """
        # Reset eligibility traces at the start of the episode.
        self.Z = np.zeros(self.memorySize)

        # Active tile indices for each action, with the state scaled so that
        # 8 tiles span the position range [-1.2, 0.5] and the velocity range
        # [-0.07, 0.07].
        F0 = tile.tiles(self.iht, 8, [8*state[0]/(0.5+1.2), 8*state[1]/(0.07+0.07)], [0])
        F1 = tile.tiles(self.iht, 8, [8*state[0]/(0.5+1.2), 8*state[1]/(0.07+0.07)], [1])
        F2 = tile.tiles(self.iht, 8, [8*state[0]/(0.5+1.2), 8*state[1]/(0.07+0.07)], [2])

        # Action values: sum of the weights of each action's active tiles.
        Q0 = self.weight[F0].sum()
        Q1 = self.weight[F1].sum()
        Q2 = self.weight[F2].sum()

        self.state = state

        # Greedy action; remember its active tiles for the next update.
        A = np.argmax([Q0, Q1, Q2])
        self.F = [F0, F1, F2][A]
        return A
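
These snippets all assume Sutton's tiles3 tile-coding software with a shared index hash table. A minimal setup sketch of what they rely on (the table size, import form, and variable names here are assumptions for illustration, not taken from any example):

import numpy as np
from tiles3 import IHT, tiles   # Sutton's tile-coding software, version 3

iht = IHT(4096)                  # hash table mapping tile coordinates to indices
weight = np.zeros(4096)          # one learned weight per tile index

position, velocity = -0.5, 0.0   # an example Mountain Car state
# 8 tilings, 8 tiles across each state range; the action enters as an int:
active = tiles(iht, 8, [8 * position / 1.7, 8 * velocity / 0.14], [0])
q = weight[active].sum()         # linear action-value estimate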
Example #2
def question_3():
    # Specify hyper-parameters
    agent = Agent()
    environment = Environment()
    rlglue = RLGlue(environment, agent)
    num_episodes = 1000
    num_runs = 1
    max_eps_steps = 1000000
    for _ in range(num_runs):
        rlglue.rl_init()
        for i in range(num_episodes):
            rlglue.rl_episode(max_eps_steps)
            print(i)
    fout = open('value', 'w')
    steps = 50
    w, iht = rlglue.rl_agent_message("ValueFunction")
    Q = np.zeros([steps, steps])
    for i in range(steps):
        for j in range(steps):
            values = []
            for a in range(3):
                # Scale to tile units (8 tiles across each range), matching
                # the agent's feature construction.
                inds = tiles(iht, 8,
                             [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                              8 * (-0.07 + (j * 0.14 / steps)) / 0.14], [a])
                value = 0
                for index in inds:
                    value += w[index]
                values.append(value)
            # Cost-to-go: negate the best action value.
            height = -max(values)
            fout.write(repr(height) + ' ')
            Q[j][i] = height
        fout.write('\n')
    fout.close()
    np.save("value", Q)
Example #3
    def agent_message(self, in_message):
        """
        Arguments: in_message - string
        Returns: The value function as a list.
        This function is complete. You do not need to add code here.
        """
        if in_message == 'Q3':
            values = np.zeros((50,50))
            steps = 50
            numActions = 3
            for i in range(steps):
                for j in range(steps):
                    Q = np.zeros(3)
                    for a in range(numActions):

                        # Scale to tile units (8 tiles across each range) so
                        # the query matches the agent's feature construction.
                        inds = tile.tiles(self.iht, 8,
                                          [8 * (-1.2 + (i * 1.7 / steps)) / 1.7,
                                           8 * (-0.07 + (j * 0.14 / steps)) / 0.14],
                                          [a])
                        Q[a] = np.sum(self.weight[inds])

                    # Negated best action value: the cost-to-go.
                    values[i][j] = -np.max(Q)

            np.save('values', values)
        else:
            return "I dont know how to respond to this message!!"
Example #4
 def mytiles(self, x, x_dot, action):
     # Scale position and velocity so 8 tiles span each range, shifting
     # [-1.2, 0.5] and [-0.07, 0.07] to start at zero first.
     scaling_factor_x = 8 / 1.7
     scaling_factor_xdot = 8 / 0.14
     x = (x + 1.2) * scaling_factor_x
     x_dot = (x_dot + 0.07) * scaling_factor_xdot
     return tiles(self.iht, self.offset_t, [x, x_dot], [action])
Example #5
 def getActiveTiles(self, state):
     # Collect the active tile indices for each of the three actions.
     tileVectors = []
     scaled = self.getTileScale(state)
     for a in range(3):
         tileVectors.append(tiles(self.iht, 8, scaled, [a]))
     return tileVectors
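
getTileScale is not shown in the original. A plausible sketch of the helper, assuming the same 8-tiles-per-range scaling the other examples use:

 def getTileScale(self, state):
     # Hypothetical helper: map position and velocity into tile units.
     return [state[0] * 8 / (0.5 + 1.2), state[1] * 8 / (0.07 + 0.07)]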
Example #6
    def agent_step(self, reward, state):
        """
        Arguments: reward - floating point, state - numpy array
        Returns: action - integer
        """

        # Begin the TD error with the reward, subtract the previous
        # state-action value, and set replacing traces on its tiles.
        delta = reward
        for i in self.F:
            delta = delta - self.weight[i]
            self.Z[i] = 1

        # Active tiles and action values for the new state.
        F0 = tile.tiles(self.iht, 8, [8*state[0]/(0.5+1.2), 8*state[1]/(0.07+0.07)], [0])
        F1 = tile.tiles(self.iht, 8, [8*state[0]/(0.5+1.2), 8*state[1]/(0.07+0.07)], [1])
        F2 = tile.tiles(self.iht, 8, [8*state[0]/(0.5+1.2), 8*state[1]/(0.07+0.07)], [2])

        Q0 = self.weight[F0].sum()
        Q1 = self.weight[F1].sum()
        Q2 = self.weight[F2].sum()

        # Greedy action; complete the TD error with its discounted value,
        # then update the weights and decay the traces.
        A = np.argmax([Q0, Q1, Q2])
        F_next = [F0, F1, F2][A]
        for i in F_next:
            delta += self.gamma * self.weight[i]
        self.weight += self.alpha * delta * self.Z
        self.Z *= self.gamma * self.lam

        self.F = F_next

        return A
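
In symbols, agent_start and agent_step together implement linear Sarsa(λ) with replacing traces, where $F(s,a)$ is the set of active tiles:

\delta \leftarrow R + \gamma \hat{q}(S', A', \mathbf{w}) - \hat{q}(S, A, \mathbf{w}),
\qquad \hat{q}(s, a, \mathbf{w}) = \sum_{i \in F(s,a)} w_i

\mathbf{w} \leftarrow \mathbf{w} + \alpha\, \delta\, \mathbf{z},
\qquad \mathbf{z} \leftarrow \gamma \lambda \mathbf{z}
\quad (\text{with } z_i = 1 \text{ for } i \in F(S, A) \text{ before the update})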
Example #7
 def get_action(self, state):
     # Epsilon-greedy: explore with probability eps; otherwise act greedily,
     # breaking ties among equal-valued actions uniformly at random.
     if np.random.random() < self.eps:
         return np.random.choice(self.actions)
     value = np.asarray([
         self.w[tile3.tiles(self.iht, self.num_tiling,
                            state * self.reshape, [a])].sum()
         for a in self.actions
     ], dtype=float)
     return self.actions[np.random.choice(np.flatnonzero(value == value.max()))]
Example #8
 def agent_end(self, reward):
     """
     Run when the agent terminates.
     Args:
         reward (float): the reward the agent received for entering the
             terminal state.
     """
     # Terminal update: there is no successor state, so the TD error is just
     # the reward minus the last state-action value.
     features = tile3.tiles(self.iht, self.num_tiling,
                            self.s_prev * self.reshape, [self.a_prev])
     delta = reward - self.w[features].sum()
     self.replacing_trace[features] = 1
     self.w += self.alpha * delta * self.replacing_trace
Example #9
 def generateFeatures(self, observation, action):
     # Cache the active tile indices per (position, velocity, action) so
     # repeated queries skip the tile-coding call.
     if (observation[0], observation[1], action) not in self.features:
         positionScale = 8 / (0.5 + 1.2)
         velocityScale = 8 / (0.07 + 0.07)
         self.features[observation[0], observation[1], action] = tiles(
             self.iht, NUM_TILINGS,
             [observation[0] * positionScale, observation[1] * velocityScale],
             [action])
Example #10
 def agent_step(self, reward, state):
     """
     A step taken by the agent.
     Args:
         reward (float): the reward received for taking the last action taken
         state (state observation): The agent's current state
     Returns:
         The action the agent is taking.
     """
      # Tiles of the previous state-action pair get a replacing trace of 1.
      features = tile3.tiles(self.iht, self.num_tiling,
                             self.s_prev * self.reshape, [self.a_prev])
      self.replacing_trace[features] = 1
      action = self.get_action(state)
      new_features = tile3.tiles(self.iht, self.num_tiling,
                                 state * self.reshape, [action])
      # Sarsa TD error, then weight update and trace decay.
      delta = (reward + self.gamma * self.w[new_features].sum()
               - self.w[features].sum())
      self.w += self.alpha * delta * self.replacing_trace
      self.replacing_trace *= self.gamma * self.lam
     self.s_prev = state.copy()
     self.a_prev = action
     return action
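
Examples #7, #8, and #10 appear to share one agent class. A sketch of the fields they rely on; every name and value below is an assumption for illustration, not taken from the original:

 def __init__(self):
     # Hypothetical initialization consistent with the calls above.
     self.num_tiling = 8
     self.iht = tile3.IHT(4096)                    # tile-index hash table
     self.w = np.zeros(4096)                       # linear value weights
     self.replacing_trace = np.zeros(4096)         # eligibility traces
     self.reshape = np.array([8 / 1.7, 8 / 0.14])  # state -> tile units
     self.actions = [0, 1, 2]
     self.alpha = 0.1 / self.num_tiling            # per-tiling step size
     self.gamma = 1.0
     self.lam = 0.9
     self.eps = 0.0
     self.s_prev = None
     self.a_prev = None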
Example #11
 def compute_for_3d_plot(self):
     steps = 50
     values = np.zeros((steps, steps))
     i_values = np.linspace(-1.2, 0.5, steps)
     j_values = np.linspace(-0.07, 0.07, steps)
     # Negated greedy value (cost-to-go) at each grid point.
     for i in range(steps):
         for j in range(steps):
             values[i, j] = -max([
                 self.w[tile3.tiles(
                     self.iht, self.num_tiling,
                     np.array([i_values[i], j_values[j]]) * self.reshape,
                     [a])].sum() for a in self.actions
             ])
     return [i_values, j_values, values]
Example #12
 def plot3DGraph(self):
     step_size = 50
     f = open('plotValues.txt', 'w')
     for i in range(step_size):
         pos = -1.2 + (i * 1.7 / step_size)
         for j in range(step_size):
             vel = -0.07 + (j * 0.14 / step_size)
             values = []
             for a in range(3):
                 scaled = self.getTileScale([pos, vel])
                 # One-hot feature vector over the 2048 tile indices; its dot
                 # product with the weights gives the action value.
                 stateVector = np.zeros(2048)
                 inds = tiles(self.iht, 8, scaled, [a])
                 for element in inds:
                     stateVector[element] = 1
                 values.append(np.dot(stateVector, self.weights))
             # Write the negated greedy value (cost-to-go).
             height = max(values)
             f.write(repr(-height) + " ")
         f.write("\n")
     f.close()
     plotGraph3D()
Example #13
def mytiles(position, velocity, action=[]):
    # 5 tiles span each state range (position width 1.7, velocity width 0.14).
    scale_P = 5 / 1.7
    scale_V = 5 / 0.14
    return tiles(iht, numTilings,
                 [position * scale_P, velocity * scale_V], action)
Example #14
    def choose(self, state):
        # Find the value of each action: the sum of the weights over its
        # active tiles.
        scaled = [state[0] * self.pos_scale, state[1] * self.vel_scale]
        all_tiles = [tiles(self.iht, 8, scaled, [a]) for a in range(3)]
        q = np.array([self.w[f].sum() for f in all_tiles])

        # Tie breaking: pick uniformly at random among the maximal actions,
        # which reduces to a plain argmax when the maximum is unique.
        idx = np.random.choice(np.flatnonzero(q == q.max()))

        # Return the chosen action and its active tile indices.
        return idx, np.array(all_tiles[idx])
Example #15
def mytiles(x, y):
    # 20 tiles span a [0, 2*pi) range in each dimension.
    scaleFactor = 20 / (2 * np.pi)
    return tiles(iht, numTilings, [x * scaleFactor, y * scaleFactor])
Example #16
 def mytiles(self, position, action):
     # Scale each state dimension so numTilings tiles span its range
     # (position width 1.7, velocity width 0.14).
     scaled = [
         self.numTilings * position[0] / 1.7,
         self.numTilings * position[1] / 0.14
     ]
     return tiles(self.iht, self.numTilings, scaled, [action])