Example #1
    def test_multiple_inserts(self):
        indices = tc.tiles(self.iht, 8, [3.6, 7.21])
        self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
        indices = tc.tiles(self.iht, 8, [3.7, 7.21])
        self.assertEqual(indices, [0, 1, 2, 8, 4, 5, 6, 7])
        indices = tc.tiles(self.iht, 8, [4, 7])
        self.assertEqual(indices, [9, 10, 11, 8, 4, 12, 6, 7])
        indices = tc.tiles(self.iht, 8, [-37.2, 7])
        self.assertEqual(indices, [13, 14, 15, 16, 17, 18, 19, 20])
def get_tiles(iht, num_tilings, tile_scale, state, action=None):
    # use tile coding to construct the feature vector
    if action is None:
        mytiles = tiles(iht, num_tilings, state * tile_scale)
    else:
        mytiles = tiles(iht, num_tilings, state * tile_scale, [action])

    # feature_vec = self.feature_vec_init
    # feature_vec[mytiles] = 1
    return mytiles
def agent_step(reward, state):
    """
    Arguments: reward: floating point, state: NumPy array
    Returns: action: integer
    """

    global last_state, w, Value_func, z, last_action

    delta = reward

    x = 8 * state[0] / (0.5 + 1.2)
    xdot = 8 * state[1] / (0.07 + 0.07)
    current_state = [x, xdot]

    last_indices = tiles(iht, 8, last_state, [last_action])
    for i in last_indices:
        delta -= w[i]
        z[i] = 1

    feature1 = np.zeros(1944)
    feature_list1 = tiles(iht, 8, current_state, [0])

    feature2 = np.zeros(1944)
    feature_list2 = tiles(iht, 8, current_state, [1])

    feature3 = np.zeros(1944)
    feature_list3 = tiles(iht, 8, current_state, [2])

    for i in feature_list1:
        feature1[i] = 1
    for i in feature_list2:
        feature2[i] = 1
    for i in feature_list3:
        feature3[i] = 1

    v1 = np.dot(w, feature1)
    v2 = np.dot(w, feature2)
    v3 = np.dot(w, feature3)

    action = np.argmax([v1, v2, v3])
    indices = tiles(iht, 8, current_state, [action])
    for i in indices:
        delta += w[i]

    w += alpha * z * delta
    z = z * gamma * lam
    last_action = action
    last_state = current_state

    return action
Example #4
    def get_tiles(self, position, velocity):
        """
        Takes in a position and velocity from the mountaincar environment
        and returns a numpy array of active tiles.
        
        Arguments:
        position -- float, the position of the agent between -1.2 and 0.5
        velocity -- float, the velocity of the agent between -0.07 and 0.07
        Returns:
        tiles -- np.array, active tiles
        """

        POSITION_MIN = -1.2
        POSITION_MAX = 0.5
        VELOCITY_MIN = -0.07
        VELOCITY_MAX = 0.07

        position_scale = self.num_tiles / (POSITION_MAX - POSITION_MIN)
        velocity_scale = self.num_tiles / (VELOCITY_MAX - VELOCITY_MIN)

        tiles = tc.tiles(
            self.iht, self.num_tilings,
            [position * position_scale, velocity * velocity_scale])

        return np.array(tiles)
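The get_tiles snippets in these examples all rely on an index hash table (IHT) from Sutton's tiles3 module, created elsewhere in the class. A minimal, self-contained sketch of that setup (the 4096 table size and the variable names below are illustrative assumptions, not taken from the example above):

import numpy as np
import tiles3 as tc

iht = tc.IHT(4096)                 # assumed hash-table size
num_tilings, num_tiles = 8, 8

# Scale a Mountain Car state onto roughly [0, num_tiles) per dimension, then hash.
position, velocity = -0.5, 0.01
active = tc.tiles(iht, num_tilings,
                  [position * num_tiles / (0.5 - (-1.2)),
                   velocity * num_tiles / (0.07 - (-0.07))])

# With a linear function approximator, the state value is the sum of the
# weights at the active tile indices (one index per tiling).
weights = np.zeros(4096)
value = np.sum(weights[np.array(active)])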
def mytiles(x, y, action):
    global numTilings, tile_width, iht
    scaleFactor1 = tile_width / (0.5 - (-1.2))  #scale factor for the position
    scaleFactor2 = tile_width / (0.07 -
                                 (-0.07))  #scale factor for the velocity
    return tiles(iht, numTilings, [x * scaleFactor1, y * scaleFactor2],
                 [action])
Example #6
    def feature(self, state, action):
        x, xdot = state
        a = action - 1
        idx = tiles3.tiles(self.iht, 8,
                           [8.0 * x / (0.5 + 1.2), 8.0 * xdot / (0.07 + 0.07)],
                           [a])
        fea = np.zeros((2048, 1), dtype=float)
        fea[idx] = 1.0
        return fea
Example #7
def q(hashtable, state, action, weights):
    active_tiles = tiles(
        hashtable, NUM_TILES,
        np.multiply(state,
                    [1.5, 1.5, 1.5, 1.5, 1.5 / 12.566371, 1.5 / 28.274334]),
        [action])
    return sum(weights[active_tiles])
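In practice a q function like this feeds an epsilon-greedy selection over the discrete actions, as the next example also shows. A brief usage sketch on top of q() above (ACTIONS and EPSILON are assumed names, not part of the snippet):

import numpy as np

ACTIONS = [0, 1, 2]   # assumed discrete action set
EPSILON = 0.1

def choose_action(hashtable, state, weights):
    # Explore with probability EPSILON, otherwise act greedily w.r.t. q().
    if np.random.random() < EPSILON:
        return int(np.random.choice(ACTIONS))
    values = [q(hashtable, state, a, weights) for a in ACTIONS]
    return int(np.argmax(values))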
Example #8
def agent_step(reward, state):
    global actions, old_state, old_action, iht, weights, z

    #tile-coding
    scaled_ns1 = 1. * NUM_TILES * (state[0] - POSITION[0]) / (POSITION[1] -
                                                              POSITION[0])
    scaled_ns2 = 1. * NUM_TILES * (state[1] - VELOCITY[0]) / (VELOCITY[1] -
                                                              VELOCITY[0])

    hash_s = old_state
    hash_ns = np.asarray(tiles(iht, NUM_TILINGS, [scaled_ns1, scaled_ns2]))

    #epsilon-greedy
    rand = rand_un()
    if rand < EPSILON:
        n_action = random.choice(actions)
    else:
        n_action = np.argmax(np.sum(weights[:, hash_ns], axis=1))

    #learning and update traces
    q = np.sum(weights[old_action, hash_s])
    nq = np.sum(weights[n_action, hash_ns])

    z[old_action, hash_s] = 1.

    weights += ALPHA * (reward + GAMMA * nq - q) * z
    z *= GAMMA * LAMBDA

    old_state = hash_ns
    old_action = n_action
    return n_action
Example #9
File: agent.py Project: Rosevear/RL
def approx_value(state, weights):
    """
    Return the current approximated value for the state given the weights, and the gradient
    for the state, which is simply the feature vector for the state.
    """
    global iht

    if AGENT == "STATE_AGG":
        feature_vector = np.array([0.0 for weight in range(weights.shape[1])])
        state_groups = [group for group in range(1, NUM_STATES + AGGREGATE_SIZE, AGGREGATE_SIZE)]
        for i in range(len(state_groups) - 1):
            if state >= state_groups[i] and state <= state_groups[i + 1]:
                feature_vector[i] = 1
                break
    elif AGENT == "TABULAR":
        feature_vector = np.array([0.0 for weight in range(weights.shape[1])])
        feature_vector[state - 1] = 1
    elif AGENT == "POLYNOMIAL":
        feature_vector = np.array([float((state / NUM_STATES) ** degree) for degree in range(POLY_DEGREE + 1)])
    elif AGENT == "TILE_CODING":
        cur_state_rep = float((state / NUM_STATES) * (1 / TILE_WIDTH)) #Do this to get the right tile width
        cur_tiles = tiles(iht, NUM_TILINGS, [cur_state_rep])
        estimate = 0
        for tile in cur_tiles:
            estimate += weights[0][tile]
        return (estimate, cur_tiles)
    else:
        exit("Invalid agent selection!")
    feature_vector = feature_vector[np.newaxis]
    return (np.dot(weights, np.transpose(feature_vector)), feature_vector)
    def train(self, states, actions, targets):
        assert isinstance(states, np.ndarray)
        assert isinstance(actions, np.ndarray)
        assert isinstance(targets, np.ndarray)
        assert states.ndim == 2
        assert actions.ndim == 1
        assert targets.ndim == 1
        assert len(states) == len(actions) == len(targets)

        for i in range(len(states)):
            state = states[i]
            action = actions[i]
            target = targets[i]

            assert len(state) == self._n_dim
            assert np.isscalar(action)
            assert np.isscalar(target)

            scaled_state = np.multiply(
                self._scales, state)  # scale state to map to tiles correctly
            active_tiles = tiles3.tiles(  # find active tiles
                self._iht, self._num_tilings, scaled_state, [action])
            value = np.sum(
                self._weights[active_tiles])  # q-value for state-action pair
            delta = self._lr * (target - value)  # grad is [0,1,0,0,..]
            self._weights[
                active_tiles] += delta  # ..so we pick active weights instead
def get_vector(state):
    #print state
    t = tiles(iht, num_tilings, [float(state) / 200])
    vector = np.zeros(total_states)
    for i in t:
        vector[i] = 1.0
    return vector
Example #12
    def _tileEncode_decoder(self, p, v, action):
        '''
        This method scales the state space onto a 10x10x10 grid
        '''
        v_scaleFactor = 10 / 0.14
        p_scaleFactor = 10 / 1.8
        action_scaleFactor = 10 / 2
        return tiles(self.iht, self.numTilings,
                     [p * p_scaleFactor, v * v_scaleFactor, action * action_scaleFactor])
Example #13
def x_tile(state):
    global iht, agent_type
    indices = tiles(iht, num_tilings[agent_type],
                    (state - 1) / (size * tile_widths[agent_type]))
    t = np.zeros((IHT_SIZE,
                  1))  #(num_tilings[agent_type] / tile_widths[agent_type], 1))
    t[indices] = 1.0
    return t
Example #14
    def generateFeatures(self, observation, action):
        positionScale = NUM_TILINGS / (0.5 + 1.2)
        velocityScale = NUM_TILINGS / (0.07 + 0.07)
        features = tiles(
            self.iht, NUM_TILINGS,
            [observation[0] * positionScale, observation[1] * velocityScale],
            [action])
        return np.array(features)
Example #15
def embed(state, action):
    """
    States are embedded in {0, 1}^4096 using tile coding
    See Richard S. Sutton's http://incompleteideas.net/tiles/tiles3.html
    """
    return tiles(
        iht, 8, [8 * (state[0] / (0.5 + 1.2)), 8 * (state[1] / (0.07 + 0.07))],
        [action])
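As the docstring says, the indices returned here pick out the nonzero entries of a feature vector in {0, 1}^4096. A short sketch of that conversion (the one_hot name is ours; the 4096 size follows the docstring):

import numpy as np

def one_hot(state, action, size=4096):
    # Turn the active tile indices from embed() into a binary feature vector.
    phi = np.zeros(size)
    phi[embed(state, action)] = 1.0
    return phi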
Example #16
    def eval(self, state, action):
        assert len(state) == self._n_dim
        assert np.isscalar(action)
        scaled_state = np.multiply(
            self._scales, state)  # scale state to map to tiles correctly
        active_tiles = tiles3.tiles(  # find active tiles
            self._iht, self._num_tilings, scaled_state, [action])
        return np.sum(
            self._weights[active_tiles])  # pick correct weights and sum up
    def get_tiles(self, position, velocity):
        position_scaled = (position + 1.2) / (0.5 + 1.2) * self.num_tiles
        velocity_scaled = (velocity + 0.07) / (0.07 + 0.07) * self.num_tiles

        # get the tiles using tc.tiles, with self.iht, self.num_tilings and [scaled position, scaled velocity]
        # nothing to implement here
        tiles = tc.tiles(self.iht, self.num_tilings, [position_scaled, velocity_scaled])

        return np.array(tiles)
def my_tiles(state, action):
    # return tiles(iht, num_tilings, [(state[0] + 1.2) / (1.2 + 0.5) * tiling[0], (state[1] + 0.07) / (0.07 + 0.07)
    #                                 * tiling[1]], [action])
    # A = [action]
    pos = state[0]
    vel = state[1]
    return tiles(
        iht, num_tilings,
        [tiling[0] * pos / (0.5 + 1.2), tiling[1] * vel / (0.07 + 0.07)],
        [action])
    def get_tiles(self, state, action):
        """Get the encoded state_action using grid tiling.
        Ultimate resolution = 1/16.
        :param state: (x, x_dot)
        :param action: {-1, 0, 1}
        :return:
        """
        x, x_dot = state
        #return tiles(self.iht, self.num_tilings, [x * 8, x_dot * 8, action])

        return tiles(self.iht, self.num_tilings,
                     [x * 8 / (0.5 + 1.2), x_dot * 8 / (0.07 + 0.07)], [action])
Example #20
def agent_step(reward, state):
    global actions, old_state, iht, weights

    #tile-coding
    scaled_s = 1. * NUM_TILES * (old_state - 1) / 999
    scaled_ns = 1. * NUM_TILES * (state[0] - 1) / 999
    hash_s = np.asarray(tiles(iht, NUM_TILINGS, [scaled_s]))
    hash_ns = np.asarray(tiles(iht, NUM_TILINGS, [scaled_ns]))

    v = np.sum(weights[hash_s])
    nv = np.sum(weights[hash_ns])

    #learning
    s_features = np.zeros_like(weights)
    s_features[hash_s] = 1.
    weights += ALPHA * (reward + GAMMA * nv - v) * s_features

    old_state = state[0]
    action = random.choice(actions)
    return action
    def mytiles(self, s, a):
        '''
        Wrapper method to produce the binary feature of state-action pair (s, a). It
        returns a list of self.numTilings numbers that denote the indices of the active tiles.
        '''
        assert (len(s) == len(self.stateLow) and 0 <= a < self.numActions)

        return tiles(self.iht,
                     self.numTilings,
                     list(self.scalingFactor * s),
                     ints=[a])
    def get_tiles(self, position, velocity):
        POSITION_MIN = -1.2
        POSITION_MAX = 0.5
        VELOCITY_MIN = -0.07
        VELOCITY_MAX = 0.07

        position_scale = self.num_tiles / (POSITION_MAX - POSITION_MIN)
        velocity_scale = self.num_tiles / (VELOCITY_MAX - VELOCITY_MIN)

        return tc.tiles(self.ith, self.num_tilings,
                        [position * position_scale, velocity * velocity_scale])
Example #23
    def get_phi(self, S, A=None):
        indices = tiles3.tiles(self.iht, self.num_tilings, self.get_inputs(S))

        if A is None:
            phi = np.zeros([self.num_tiles])
            for idx in indices:
                phi[idx] = 1
            return phi
        else:
            phi = np.zeros([self.num_tiles * self.num_actions])
            for idx in indices:
                phi[self.num_tiles * A + idx] = 1
            return phi
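The else branch above stacks one block of num_tiles entries per action, so a single flat weight vector of length num_tiles * num_actions covers every action. A hedged usage sketch (the w attribute and the q_value name are assumptions, not from the snippet):

import numpy as np

def q_value(agent, S, A):
    # Action value as a dot product against one flat weight vector.
    phi = agent.get_phi(S, A)          # length num_tiles * num_actions
    return float(np.dot(agent.w, phi))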
Example #24
    def train(self, state, action, target):
        assert len(state) == self._n_dim
        assert np.isscalar(action)
        assert np.isscalar(target)
        scaled_state = np.multiply(
            self._scales, state)  # scale state to map to tiles correctly
        active_tiles = tiles3.tiles(  # find active tiles
            self._iht, self._num_tilings, scaled_state, [action])
        value = np.sum(
            self._weights[active_tiles])  # q-value for state-action pair
        delta = self._lr * (target - value)  # grad is [0,1,0,0,..]
        self._weights[
            active_tiles] += delta  # ..so we pick active weights instead
Example #25
def agent_message(in_message):  # returns string, in_message: string
    global state_features, weights
    if in_message == "Values":
        estimates = np.zeros(1000)
        for s in range(1, 1001):
            scaled_s = 1. * NUM_TILES / 999 * (s - 1)  # scaled the state to [0,10)
            hash_s = np.asarray(tiles(iht, NUM_TILINGS, [scaled_s]))
            v = np.sum(weights[hash_s])

            estimates[s - 1] = v
        return estimates
    else:
        return "I don't know what to return!!"
def agent_start(state):
    """
    Hint: Initialize the variavbles that you want to reset before starting a new episode
    Arguments: state: numpy array
    Returns: action: integer
    """
    global last_state, state1, last_action

    x = 8 * state[0] / (0.5 + 1.2)
    xdot = 8 * state[1] / (0.07 + 0.07)
    current_state = [x, xdot]

    feature1 = np.zeros(1944)
    feature_list1 = tiles(iht, 8, current_state, [0])

    feature2 = np.zeros(1944)
    feature_list2 = tiles(iht, 8, current_state, [1])

    feature3 = np.zeros(1944)
    feature_list3 = tiles(iht, 8, current_state, [2])

    for i in feature_list1:
        feature1[i] = 1
    for i in feature_list2:
        feature2[i] = 1
    for i in feature_list3:
        feature3[i] = 1

    v1 = np.dot(w, feature1)
    v2 = np.dot(w, feature2)
    v3 = np.dot(w, feature3)

    action = np.argmax([v1, v2, v3])
    last_action = action
    last_state = current_state

    return action
Example #27
    def extract(self, obs):
        """
        Set a list of indices to 1 based on observation
        Args:
            obs (list): List of floats

        Returns:
            Vector with all zeros except the list of indices set to 1
        """
        obs = np.array(obs)
        state = np.zeros(self.state_size)
        idx = tiles(self.iht, self.n_tilings,
                    (obs - self.limits[:, 0]) * self.scaling)
        state[idx] = 1
        return state
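The scaling in extract() shifts each observation dimension so it starts at zero before multiplying by a per-dimension scale. A sketch of how those attributes might be initialized (the class name, the 1024 table size, and the parameter defaults are all assumptions):

import numpy as np
import tiles3 as tc

class TileFeatureExtractor:
    # Assumed constructor for the extract() method above.
    def __init__(self, limits, n_tilings=8, n_tiles=8, state_size=1024):
        self.limits = np.array(limits)          # shape (n_dims, 2): [low, high] per dimension
        self.scaling = n_tiles / (self.limits[:, 1] - self.limits[:, 0])
        self.n_tilings = n_tilings
        self.state_size = state_size
        self.iht = tc.IHT(state_size)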
    def get_tiles(self, position, velocity):

        POSITION_MIN = -1.2
        POSITION_MAX = 0.6
        VELOCITY_MIN = -0.07
        VELOCITY_MAX = 0.07

        position_scale = self.num_tiles / (POSITION_MAX - POSITION_MIN)
        velocity_scale = self.num_tiles / (VELOCITY_MAX - VELOCITY_MIN)

        tiles = tc.tiles(
            self.iht, self.num_tilings,
            [position * position_scale, velocity * velocity_scale])

        return np.array(tiles)
def agent_end(reward):
    """
    Arguments: reward: floating point
    Returns: Nothing
    """
    global w, Value_func
    delta = reward
    last_indices = tiles(iht, 8, last_state, [last_action])
    for i in last_indices:
        delta -= w[i]
        z[i] = 1
    w += alpha * z * delta

    return
Example #30
def agent_end(reward):
    global actions, old_state, iht, weights

    #tile-coding
    scaled_s = 1. * NUM_TILES * (old_state -
                                 1) / 999  # scaled the state to [0,10)
    hash_s = np.asarray(tiles(iht, NUM_TILINGS, [scaled_s]))

    #learning
    s_features = np.zeros_like(weights)
    s_features[hash_s] = 1
    v = np.sum(weights[hash_s])

    weights += ALPHA * (reward - v) * s_features

    return