def test_multiple_inserts(self):
    indices = tc.tiles(self.iht, 8, [3.6, 7.21])
    self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
    indices = tc.tiles(self.iht, 8, [3.7, 7.21])
    self.assertEqual(indices, [0, 1, 2, 8, 4, 5, 6, 7])
    indices = tc.tiles(self.iht, 8, [4, 7])
    self.assertEqual(indices, [9, 10, 11, 8, 4, 12, 6, 7])
    indices = tc.tiles(self.iht, 8, [-37.2, 7])
    self.assertEqual(indices, [13, 14, 15, 16, 17, 18, 19, 20])
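# Hypothetical setup for the test above, assuming Sutton's tiles3 module is
# imported as `tc`. A fresh IHT hands out indices in order of first use, which
# is why the first call is expected to return [0, 1, ..., 7]. The class name
# and the IHT size of 32 are illustrative assumptions, not from the original.
import unittest
import tiles3 as tc

class TileCoderTest(unittest.TestCase):
    def setUp(self):
        self.iht = tc.IHT(32)  # small table keeps the expected indices stable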
def get_tiles(iht, num_tilings, tile_scale, state, action=None):
    # use tile coding to find the active tile indices for the scaled state,
    # optionally hashed together with the action
    if action is None:
        mytiles = tiles(iht, num_tilings, state * tile_scale)
    else:
        mytiles = tiles(iht, num_tilings, state * tile_scale, [action])
    return mytiles
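# Minimal usage sketch for get_tiles above; the IHT size, tile_scale, and
# state values are illustrative assumptions.
import numpy as np
from tiles3 import IHT, tiles

iht = IHT(4096)
state = np.array([0.2, -0.03])
active = get_tiles(iht, num_tilings=8, tile_scale=8.0, state=state, action=1)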
def agent_step(reward, state):
    """
    Arguments: reward: floating point, state: numpy array
    Returns: action: integer
    """
    global last_state, w, Value_func, z, last_action
    delta = reward
    # scale position and velocity so the 8 tilings span the state space
    x = 8 * state[0] / (0.5 + 1.2)
    xdot = 8 * state[1] / (0.07 + 0.07)
    current_state = [x, xdot]
    # accumulate the TD error for the previous state-action pair
    last_indices = tiles(iht, 8, last_state, [last_action])
    for i in last_indices:
        delta -= w[i]
        z[i] = 1
    # build a binary feature vector for each of the three actions
    feature1 = np.zeros(1944)
    feature_list1 = tiles(iht, 8, current_state, [0])
    feature2 = np.zeros(1944)
    feature_list2 = tiles(iht, 8, current_state, [1])
    feature3 = np.zeros(1944)
    feature_list3 = tiles(iht, 8, current_state, [2])
    for i in feature_list1:
        feature1[i] = 1
    for i in feature_list2:
        feature2[i] = 1
    for i in feature_list3:
        feature3[i] = 1
    # greedy action selection over the three action values
    v1 = np.dot(w, feature1)
    v2 = np.dot(w, feature2)
    v3 = np.dot(w, feature3)
    action = np.argmax([v1, v2, v3])
    indices = tiles(iht, 8, current_state, [action])
    for i in indices:
        delta += w[i]
    # Sarsa(lambda) updates for the weights and eligibility traces
    w += alpha * z * delta
    z = z * gamma * lam
    last_action = action
    last_state = current_state
    return action
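# Sketch of the global state agent_step above appears to rely on. The names
# are taken from the function body; the concrete values are assumptions.
import numpy as np
from tiles3 import IHT, tiles

iht = IHT(1944)            # matches the 1944-dim feature vectors above
w = np.zeros(1944)         # weight vector
z = np.zeros(1944)         # eligibility traces
alpha, gamma, lam = 0.1 / 8, 1.0, 0.9
last_state, last_action = None, None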
def get_tiles(self, position, velocity):
    """
    Takes in a position and velocity from the mountain car environment
    and returns a numpy array of active tiles.

    Arguments:
    position -- float, the position of the agent between -1.2 and 0.5
    velocity -- float, the velocity of the agent between -0.07 and 0.07

    Returns:
    tiles -- np.array, active tiles
    """
    POSITION_MIN = -1.2
    POSITION_MAX = 0.5
    VELOCITY_MIN = -0.07
    VELOCITY_MAX = 0.07
    position_scale = self.num_tiles / (POSITION_MAX - POSITION_MIN)
    velocity_scale = self.num_tiles / (VELOCITY_MAX - VELOCITY_MIN)
    tiles = tc.tiles(self.iht, self.num_tilings,
                     [position * position_scale, velocity * velocity_scale])
    return np.array(tiles)
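# Hypothetical wrapper class the method above could live in; the class name
# and the sizes are illustrative, not from the original.
import numpy as np
import tiles3 as tc

class MountainCarTileCoder:
    def __init__(self, iht_size=4096, num_tilings=8, num_tiles=8):
        self.iht = tc.IHT(iht_size)
        self.num_tilings = num_tilings
        self.num_tiles = num_tiles

# coder = MountainCarTileCoder()
# coder.get_tiles(position=-0.5, velocity=0.01)  # -> 8 active tile indices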
def mytiles(x, y, action):
    global numTilings, tile_width, iht
    scaleFactor1 = tile_width / (0.5 - (-1.2))    # scale factor for the position
    scaleFactor2 = tile_width / (0.07 - (-0.07))  # scale factor for the velocity
    return tiles(iht, numTilings, [x * scaleFactor1, y * scaleFactor2], [action])
def feature(self, state, action):
    x, xdot = state
    a = action - 1  # shift the action index down by one
    idx = tiles3.tiles(self.iht, 8,
                       [8.0 * x / (0.5 + 1.2), 8.0 * xdot / (0.07 + 0.07)], [a])
    fea = np.zeros((2048, 1), dtype=float)
    fea[idx] = 1.0
    return fea
def q(hashtable, state, action, weights):
    # scale the six state variables before hashing;
    # 12.566371 is 4*pi and 28.274334 is 9*pi
    active_tiles = tiles(
        hashtable, NUM_TILES,
        np.multiply(state, [1.5, 1.5, 1.5, 1.5, 1.5 / 12.566371, 1.5 / 28.274334]),
        [action])
    return sum(weights[active_tiles])
def agent_step(reward, state):
    global actions, old_state, old_action, iht, weights, z
    # tile-coding: scale the state variables to tile units
    scaled_ns1 = 1. * NUM_TILES * (state[0] - POSITION[0]) / (POSITION[1] - POSITION[0])
    scaled_ns2 = 1. * NUM_TILES * (state[1] - VELOCITY[0]) / (VELOCITY[1] - VELOCITY[0])
    hash_s = old_state
    hash_ns = np.asarray(tiles(iht, NUM_TILINGS, [scaled_ns1, scaled_ns2]))
    # epsilon-greedy action selection
    rand = rand_un()
    if rand < EPSILON:
        n_action = random.choice(actions)
    else:
        n_action = np.argmax(np.sum(weights[:, hash_ns], axis=1))
    # learning and trace updates
    q = np.sum(weights[old_action, hash_s])
    nq = np.sum(weights[n_action, hash_ns])
    z[old_action, hash_s] = 1.
    weights += ALPHA * (reward + GAMMA * nq - q) * z
    z *= GAMMA * LAMBDA
    old_state = hash_ns
    old_action = n_action
    return n_action
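# Initialization sketch for the Sarsa(lambda) step above. The constant names
# mirror the function body; every value here is an illustrative assumption.
import random
import numpy as np
from tiles3 import IHT, tiles

NUM_TILES, NUM_TILINGS = 8, 8
POSITION, VELOCITY = (-1.2, 0.5), (-0.07, 0.07)
ALPHA, GAMMA, LAMBDA, EPSILON = 0.1 / NUM_TILINGS, 1.0, 0.9, 0.0
actions = [0, 1, 2]
iht = IHT(4096)
weights = np.zeros((len(actions), 4096))  # one weight row per action
z = np.zeros_like(weights)                # eligibility traces
rand_un = np.random.rand                  # stand-in for the original RNG helper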
def approx_value(state, weights):
    """
    Return the current approximated value for state given weights, and the
    gradient for the state, which is simply the feature vector for the state.
    """
    global iht
    if AGENT == "STATE_AGG":
        feature_vector = np.zeros(weights.shape[1])
        state_groups = [group for group in
                        range(1, NUM_STATES + AGGREGATE_SIZE, AGGREGATE_SIZE)]
        for i in range(len(state_groups) - 1):
            if state_groups[i] <= state <= state_groups[i + 1]:
                feature_vector[i] = 1
                break
    elif AGENT == "TABULAR":
        feature_vector = np.zeros(weights.shape[1])
        feature_vector[state - 1] = 1
    elif AGENT == "POLYNOMIAL":
        feature_vector = np.array([float((state / NUM_STATES) ** degree)
                                   for degree in range(POLY_DEGREE + 1)])
    elif AGENT == "TILE_CODING":
        # divide by TILE_WIDTH to get the right tile width
        cur_state_rep = float((state / NUM_STATES) * (1 / TILE_WIDTH))
        cur_tiles = tiles(iht, NUM_TILINGS, [cur_state_rep])
        estimate = 0
        for tile in cur_tiles:
            estimate += weights[0][tile]
        return (estimate, cur_tiles)
    else:
        exit("Invalid agent selection!")
    feature_vector = feature_vector[np.newaxis]
    return (np.dot(weights, np.transpose(feature_vector)), feature_vector)
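# Worked scaling example for the TILE_CODING branch above (hypothetical
# values): with NUM_STATES = 1000 and TILE_WIDTH = 0.2, state 500 maps to
# (500 / 1000) / 0.2 = 2.5 tile units, so each tile spans 20% of the range.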
def train(self, states, actions, targets):
    assert isinstance(states, np.ndarray)
    assert isinstance(actions, np.ndarray)
    assert isinstance(targets, np.ndarray)
    assert states.ndim == 2
    assert actions.ndim == 1
    assert targets.ndim == 1
    assert len(states) == len(actions) == len(targets)
    for i in range(len(states)):
        state = states[i]
        action = actions[i]
        target = targets[i]
        assert len(state) == self._n_dim
        assert np.isscalar(action)
        assert np.isscalar(target)
        # scale state to map to tiles correctly
        scaled_state = np.multiply(self._scales, state)
        # find active tiles
        active_tiles = tiles3.tiles(self._iht, self._num_tilings,
                                    scaled_state, [action])
        # q-value for the state-action pair
        value = np.sum(self._weights[active_tiles])
        delta = self._lr * (target - value)
        # the gradient is a binary vector [0,1,0,0,...],
        # so we update only the active weights
        self._weights[active_tiles] += delta
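# Constructor sketch for the approximator the train method above belongs to;
# attribute names are inferred from the method body, the sizes are assumptions.
import numpy as np
import tiles3

class TileCodedQ:
    def __init__(self, n_dim, scales, num_tilings=8, iht_size=4096, lr=0.1 / 8):
        self._n_dim = n_dim
        self._scales = np.asarray(scales)
        self._num_tilings = num_tilings
        self._iht = tiles3.IHT(iht_size)
        self._weights = np.zeros(iht_size)
        self._lr = lr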
def get_vector(state):
    # binary feature vector for the scaled state
    t = tiles(iht, num_tilings, [float(state) / 200])
    vector = np.zeros(total_states)
    for i in t:
        vector[i] = 1.0
    return vector
def _tileEncode_decoder(self, p, v, action):
    '''
    This method scales the state space onto a 10x10x10 grid.
    '''
    v_scaleFactor = 10 / 0.14   # velocity spans a range of 0.14
    p_scaleFactor = 10 / 1.8    # position spans a range of 1.8
    action_scaleFactor = 10 / 2
    return tiles(self.iht, self.numTilings,
                 [p * p_scaleFactor, v * v_scaleFactor, action * action_scaleFactor])
def x_tile(state):
    global iht, agent_type
    # tiles3 expects a list of floats, so wrap the scaled state in a list
    indices = tiles(iht, num_tilings[agent_type],
                    [(state - 1) / (size * tile_widths[agent_type])])
    t = np.zeros((IHT_SIZE, 1))
    t[indices] = 1.0
    return t
def generateFeatures(self, observation, action):
    positionScale = NUM_TILINGS / (0.5 + 1.2)
    velocityScale = NUM_TILINGS / (0.07 + 0.07)
    features = tiles(
        self.iht, NUM_TILINGS,
        [observation[0] * positionScale, observation[1] * velocityScale],
        [action])
    return np.array(features)
def embed(state, action):
    """
    States are embedded in {0, 1}^4096 using tile coding.
    See Richard S. Sutton's http://incompleteideas.net/tiles/tiles3.html
    """
    return tiles(
        iht, 8,
        [8 * (state[0] / (0.5 + 1.2)), 8 * (state[1] / (0.07 + 0.07))],
        [action])
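# Usage sketch for embed above: turning the returned indices into the binary
# vector described in the docstring. The helper name and the module-level IHT
# of size 4096 (matching {0, 1}^4096) are assumptions.
import numpy as np
from tiles3 import IHT, tiles

iht = IHT(4096)

def embed_as_vector(state, action):
    vec = np.zeros(4096)
    vec[embed(state, action)] = 1.0
    return vec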
def eval(self, state, action):
    assert len(state) == self._n_dim
    assert np.isscalar(action)
    # scale state to map to tiles correctly
    scaled_state = np.multiply(self._scales, state)
    # find active tiles
    active_tiles = tiles3.tiles(self._iht, self._num_tilings,
                                scaled_state, [action])
    # pick the active weights and sum them up
    return np.sum(self._weights[active_tiles])
def get_tiles(self, position, velocity):
    position_scaled = (position + 1.2) / (0.5 + 1.2) * self.num_tiles
    velocity_scaled = (velocity + 0.07) / (0.07 + 0.07) * self.num_tiles
    # get the tiles using tc.tiles with self.iht, self.num_tilings,
    # and [scaled position, scaled velocity]
    tiles = tc.tiles(self.iht, self.num_tilings, [position_scaled, velocity_scaled])
    return np.array(tiles)
def my_tiles(state, action):
    pos = state[0]
    vel = state[1]
    return tiles(
        iht, num_tilings,
        [tiling[0] * pos / (0.5 + 1.2), tiling[1] * vel / (0.07 + 0.07)],
        [action])
def get_tiles(self, state, action):
    """Get the encoded state-action pair using grid tiling.
    Ultimate resolution = 1/16.

    :param state: (x, x_dot)
    :param action: {-1, 0, 1}
    :return: list of active tile indices
    """
    x, x_dot = state
    return tiles(self.iht, self.num_tilings,
                 [x * 8 / (0.5 + 1.2), x_dot * 8 / (0.07 + 0.07)], [action])
def agent_step(reward, state):
    global actions, old_state, iht, weights
    # tile-coding: scale both states to [0, NUM_TILES)
    scaled_s = 1. * NUM_TILES * (old_state - 1) / 999
    scaled_ns = 1. * NUM_TILES * (state[0] - 1) / 999
    hash_s = np.asarray(tiles(iht, NUM_TILINGS, [scaled_s]))
    hash_ns = np.asarray(tiles(iht, NUM_TILINGS, [scaled_ns]))
    v = np.sum(weights[hash_s])
    nv = np.sum(weights[hash_ns])
    # learning: semi-gradient TD(0) update
    s_features = np.zeros_like(weights)
    s_features[hash_s] = 1.
    weights += ALPHA * (reward + GAMMA * nv - v) * s_features
    old_state = state[0]
    action = random.choice(actions)
    return action
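# Constants the TD(0) step above appears to assume for a 1000-state task
# (states 1..1000 are scaled to [0, NUM_TILES)); all values are guesses.
import random
import numpy as np
from tiles3 import IHT, tiles

NUM_TILES, NUM_TILINGS = 10, 50
ALPHA, GAMMA = 0.01 / NUM_TILINGS, 1.0
actions = [-1, 1]
iht = IHT(2048)
weights = np.zeros(2048)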
def mytiles(self, s, a):
    '''
    Wrapper method producing the binary feature of the state-action pair
    (s, a). It returns a list of self.numTilings numbers denoting the
    indices of the active tiles.
    '''
    assert len(s) == len(self.stateLow) and 0 <= a < self.numActions
    return tiles(self.iht, self.numTilings, list(self.scalingFactor * s), ints=[a])
def get_tiles(self, position, velocity):
    POSITION_MIN = -1.2
    POSITION_MAX = 0.5
    VELOCITY_MIN = -0.07
    VELOCITY_MAX = 0.07
    position_scale = self.num_tiles / (POSITION_MAX - POSITION_MIN)
    velocity_scale = self.num_tiles / (VELOCITY_MAX - VELOCITY_MIN)
    return tc.tiles(self.iht, self.num_tilings,
                    [position * position_scale, velocity * velocity_scale])
def get_phi(self, S, A=None):
    indices = tiles3.tiles(self.iht, self.num_tilings, self.get_inputs(S))
    if A is None:
        # state features only
        phi = np.zeros([self.num_tiles])
        for idx in indices:
            phi[idx] = 1
        return phi
    else:
        # stack one block of tile features per action
        phi = np.zeros([self.num_tiles * self.num_actions])
        for idx in indices:
            phi[self.num_tiles * A + idx] = 1
        return phi
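# Usage sketch for get_phi above: stacking one copy of the tile features per
# action gives a linear Q-function. `agent` and `theta` are hypothetical names.
# phi = agent.get_phi(S, A=1)    # shape: (num_tiles * num_actions,)
# q_sa = np.dot(theta, phi)      # linear action-value estimate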
def train(self, state, action, target):
    assert len(state) == self._n_dim
    assert np.isscalar(action)
    assert np.isscalar(target)
    # scale state to map to tiles correctly
    scaled_state = np.multiply(self._scales, state)
    # find active tiles
    active_tiles = tiles3.tiles(self._iht, self._num_tilings,
                                scaled_state, [action])
    # q-value for the state-action pair
    value = np.sum(self._weights[active_tiles])
    delta = self._lr * (target - value)
    # the gradient is a binary vector [0,1,0,0,...],
    # so we update only the active weights
    self._weights[active_tiles] += delta
def agent_message(in_message):  # returns string, in_message: string
    global state_features, weights
    if in_message == "Values":
        estimates = np.zeros(1000)
        for s in range(1, 1001):
            scaled_s = 1. * NUM_TILES / 999 * (s - 1)  # scale the state to [0, 10)
            hash_s = np.asarray(tiles(iht, NUM_TILINGS, [scaled_s]))
            v = np.sum(weights[hash_s])
            estimates[s - 1] = v
        return estimates
    else:
        return "I don't know what to return!!"
def agent_start(state):
    """
    Hint: Initialize the variables that you want to reset before starting
    a new episode.
    Arguments: state: numpy array
    Returns: action: integer
    """
    global last_state, state1, last_action
    # scale position and velocity so the 8 tilings span the state space
    x = 8 * state[0] / (0.5 + 1.2)
    xdot = 8 * state[1] / (0.07 + 0.07)
    current_state = [x, xdot]
    # build a binary feature vector for each of the three actions
    feature1 = np.zeros(1944)
    feature_list1 = tiles(iht, 8, current_state, [0])
    feature2 = np.zeros(1944)
    feature_list2 = tiles(iht, 8, current_state, [1])
    feature3 = np.zeros(1944)
    feature_list3 = tiles(iht, 8, current_state, [2])
    for i in feature_list1:
        feature1[i] = 1
    for i in feature_list2:
        feature2[i] = 1
    for i in feature_list3:
        feature3[i] = 1
    # greedy action selection over the three action values
    v1 = np.dot(w, feature1)
    v2 = np.dot(w, feature2)
    v3 = np.dot(w, feature3)
    action = np.argmax([v1, v2, v3])
    last_action = action
    last_state = current_state
    return action
def extract(self, obs):
    """
    Set a list of indices to 1 based on the observation.

    Args:
        obs (list): list of floats

    Returns:
        Vector with all zeros except the active tile indices, which are set to 1.
    """
    obs = np.array(obs)
    state = np.zeros(self.state_size)
    idx = tiles(self.iht, self.n_tilings, (obs - self.limits[:, 0]) * self.scaling)
    state[idx] = 1
    return state
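# Setup sketch for extract above. `limits` is assumed to be an (n_dims, 2)
# array of [low, high] bounds per dimension, so that `scaling` maps each
# dimension onto a grid of n_grid tiles; all names and sizes are assumptions.
import numpy as np
from tiles3 import IHT, tiles

class TileExtractor:
    def __init__(self, limits, n_tilings=8, n_grid=8, state_size=4096):
        self.limits = np.asarray(limits, dtype=float)
        self.n_tilings = n_tilings
        self.scaling = n_grid / (self.limits[:, 1] - self.limits[:, 0])
        self.state_size = state_size
        self.iht = IHT(state_size)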
def get_tiles(self, position, velocity):
    POSITION_MIN = -1.2
    POSITION_MAX = 0.6
    VELOCITY_MIN = -0.07
    VELOCITY_MAX = 0.07
    position_scale = self.num_tiles / (POSITION_MAX - POSITION_MIN)
    velocity_scale = self.num_tiles / (VELOCITY_MAX - VELOCITY_MIN)
    tiles = tc.tiles(self.iht, self.num_tilings,
                     [position * position_scale, velocity * velocity_scale])
    return np.array(tiles)
def agent_end(reward):
    """
    Arguments: reward: floating point
    Returns: nothing
    """
    global w, Value_func
    delta = reward
    # final update: the terminal state's value is zero
    last_indices = tiles(iht, 8, last_state, [last_action])
    for i in last_indices:
        delta -= w[i]
        z[i] = 1
    w += alpha * z * delta
    return
def agent_end(reward):
    global actions, old_state, iht, weights
    # tile-coding: scale the state to [0, 10)
    scaled_s = 1. * NUM_TILES * (old_state - 1) / 999
    hash_s = np.asarray(tiles(iht, NUM_TILINGS, [scaled_s]))
    # learning: terminal update, so there is no next-state value
    s_features = np.zeros_like(weights)
    s_features[hash_s] = 1
    v = np.sum(weights[hash_s])
    weights += ALPHA * (reward - v) * s_features
    return