def __init__(self,
                 num_actions,
                 state_bounds,
                 tile_coding={
                     'maxSize': 1024,
                     'num_tilings': 8,
                     'num_grids': 10
                 },
                 learning_rate=0.01,
                 discount_factor=0.9,
                 train_result_file=None):
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.nA = num_actions
        # for using tile coding to construct the feature vector
        self.tile_coding = tile_coding
        self.maxSize = tile_coding['maxSize']
        self.iht = IHT(self.maxSize)  #initialize the hash table
        self.num_tilings = tile_coding['num_tilings']
        self.tile_scale = tile_coding['num_grids'] / (state_bounds[1] -
                                                      state_bounds[0])
        self.feature_vec_zero = np.zeros(self.maxSize)

        if not train_result_file:
            self.w = np.zeros(self.maxSize)
        else:
            hf = h5py.File(train_result_file, 'r')
            trained_model = hf.get('trained_model')
            self.w = trained_model.get('w')[()]  # Dataset.value was removed in h5py >= 3.0
            hf.close()
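For context, a minimal sketch of how the fields set up above are typically consumed: a hypothetical feature_vector helper (not part of the original snippet) that turns a scalar state and a discrete action into the binary feature vector used by the linear estimator, via tiles() from Sutton's tiles3 module.

    def feature_vector(self, state, action):
        # hypothetical helper (assumption): scale the state so one tile spans one
        # grid cell, hash it together with the action, and mark the active entries
        active_tiles = tiles(self.iht, self.num_tilings,
                             [state * self.tile_scale], [action])
        x = self.feature_vec_zero.copy()
        x[active_tiles] = 1.0
        return x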
    def __init__(self, alpha, stateLow, stateHigh, numActions):
        assert (len(stateLow) == len(stateHigh))

        self.alpha = alpha
        self.stateLow = stateLow
        self.stateHigh = stateHigh
        self.numActions = numActions

        self.numTilings = findProperNumberOfTilings(len(stateLow))  # 16
        # tileWidth = ultimate resolution * numTilings
        self.tileWidth = np.array([3, 3, 1, 1]) * self.numTilings
        # Used to re-scale the range of each dimension in state to use Sutton's
        # tile coding software interface.
        self.scalingFactor = 1 / self.tileWidth

        # One advantage of Sutton's tile coder is its use of hashing. In our problem the
        # state range is so large that the resulting weight vector would otherwise be
        # prohibitively large. Hashing lets us cap the range of returned indices regardless
        # of the state space and the number of tilings: the tile coder returns distinct
        # active tiles for distinct input states (or state-action pairs) as long as unused
        # indices remain. This implicitly gives the desired property of "dynamic tile
        # coding", where the total number of tiles stays fixed but more resolution goes to
        # regions of the state space that are visited more often.
        maxSize = np.prod(np.ceil(
            (self.stateHigh - self.stateLow) / self.tileWidth),
                          dtype=int) * self.numTilings * self.numActions
        self.iht = IHT(maxSize)
        self.w = np.zeros(maxSize)
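A hedged sketch (not in the original) of how this coder would typically be queried: rescale the state by self.scalingFactor so each tile has unit width in scaled coordinates, then hash the state together with the action.

    def activeTiles(self, state, action):
        # hypothetical helper; assumes tiles() from Sutton's tiles3 module is imported
        scaledState = list(np.asarray(state) * self.scalingFactor)
        return tiles(self.iht, self.numTilings, scaledState, [action])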
Example #3
def agent_init():
    global iht, agent_type, w, possible_actions, fourier_order, seed

    np.random.seed(seed)
    """
    Hint: Initialize the variables that need to be reset before each run begins
    Returns: nothing
    """
    # initialize the policy array in a smart way
    possible_actions = np.concatenate((np.arange(-k, 0), np.arange(1, k + 1)))
    if agent_type == "tabular":
        w = np.zeros((size, 1))
    elif agent_type == "tile":
        w = np.zeros((IHT_SIZE, 1))
        iht = IHT(IHT_SIZE)
    elif agent_type == "aggregation":
        # integer division so the shape passed to np.zeros is an int
        w = np.zeros((num_tilings[agent_type] // tile_widths[agent_type], 1))
    elif agent_type == "fourier":
        w = np.zeros((2 * fourier_order + 1, 1))
    else:
        exit("Error: unknown agent_type")
Example #4
def agent_init():
    global actions, iht, weights
    # choose number of action
    actions = [0, 1, 2]
    iht = IHT(MAX_SIZE)
    # 3xn matrix
    weights = np.random.uniform(-0.001, 0, (len(actions), MAX_SIZE))
    def __init__(self, mcar, num_tiling=8, max_size=4096):
        self.num_tiling = num_tiling

        self.iht = IHT(max_size)
        self.weights = np.zeros(max_size, dtype=float)  # np.float alias was removed in NumPy 1.24

        self.x_scale = self.num_tiling / (mcar.x_max - mcar.x_min)
        self.v_scale = self.num_tiling / (mcar.v_max - mcar.v_min)
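A minimal sketch (an assumption, not from the snippet above) of how x_scale and v_scale feed Sutton's tiles() to produce the active features for a (position, velocity, action) triple:

    def active_tiles(self, x, v, action):
        # hypothetical helper mirroring the standard mountain-car tile-coding call
        return tiles(self.iht, self.num_tiling,
                     [x * self.x_scale, v * self.v_scale], [action])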
Example #6
def agent_init():
    global weights, iht, e_trace

    iht = IHT(IHT_SIZE)
    weights = np.array(
        [random.uniform(-0.001, 0) for weight in range(IHT_SIZE)])
    weights = weights[np.newaxis, :]
    e_trace = np.zeros(IHT_SIZE)
    e_trace = e_trace[np.newaxis, :]
def agent_init():
    global iht, w, seed

    # np.random.seed(seed)
    """
    Hint: Initialize the variables that need to be reset before each run begins
    Returns: nothing
    """
    # initialize the weight vector with small negative random values
    w = -0.001 * np.random.uniform(size=(IHT_SIZE, 1))
    iht = IHT(IHT_SIZE)
Example #8
    def __init__(self, environment=ServerEnv(), num_of_tiles=8):
        self.env = environment
        self.state = self.env.get_state()
        self.state_low_bound = self.env.observation_space.low
        self.state_high_bound = self.env.observation_space.high
        self.n_action = self.env.action_space.n
        self.action_space = gym.spaces.Discrete(self.n_action)
        self.num_of_tiles = num_of_tiles
        self.d = 2048
        self.w = np.zeros(self.d)

        self.hash_table = IHT(self.d)
        self.s0_scale = 1.0 * self.d / self.env.n
        self.s1_scale = 1.0 * self.d / (len(self.env.priority) - 1)
Example #9
 def __init__(self, n_tiles, n_tilings, limits):
     """
     Args:
         n_tiles (list or 1D array): Number of tiles in each dimension
         n_tilings (int): Number of tilings
         limits (list): List of (min, max) tuples for each dimension
     """
     super().__init__()
     self.n_tiles = np.array(n_tiles)
     self.n_tilings = n_tilings
     self.state_size = n_tilings * np.prod(self.n_tiles)
     self.iht = IHT(self.state_size)
     self.limits = np.array(limits)
     self.scaling = self.n_tiles / (self.limits[:, 1] - self.limits[:, 0])
Example #10
    def reset(self):
        # set up tiles to extract a discrete feature representation of the continuous mdp state and discrete action
        self.iht = IHT(self.max_size)
        self.state_scale_factor = [
            self.num_tiles / abs(self.mdp.x_max - self.mdp.x_min),
            self.num_tiles / abs(self.mdp.xdot_max - self.mdp.xdot_min)
        ]

        # setup a tiles fn which returns a list of the active tiles given the state, action pair
        self.tiles = lambda state, action: tiles(
            self.iht, self.num_tilings,
            [state[0] * self.state_scale_factor[0],
             state[1] * self.state_scale_factor[1]], [action])
        self.w = np.zeros(self.max_size)
        self.num_updates = 0
        self.w_history = []
    def __init__(self, iht_size, num_tilings, num_tiles):
        '''
        Initializes the MountainCar Tile Coder
        iht_size -- int, the size of the index hash table, typically a power of 2
        num_tilings -- int, the number of tilings
        num_tiles -- int, the number of tiles. Both width and height of the tile coder are the same.

        Class Variables:
        self.iht -- IHT, the index hash table that the tile coder will use
        self.num_tilings -- int, the number of tilings the tile coder will use
        self.num_tiles -- int, the number of tiles the tile coder will use
        '''

        self.iht = IHT(iht_size)
        self.num_tilings = num_tilings
        self.num_tiles = num_tiles
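For reference, a hedged sketch of the get_tiles method that usually accompanies this constructor; the MountainCar position range [-1.2, 0.5] and velocity range [-0.07, 0.07] are assumptions of this sketch, not part of the snippet above.

    def get_tiles(self, position, velocity):
        # scale position and velocity into [0, num_tiles] before hashing
        position_scale = self.num_tiles / (0.5 - (-1.2))
        velocity_scale = self.num_tiles / (0.07 - (-0.07))
        return tiles(self.iht, self.num_tilings,
                     [position * position_scale, velocity * velocity_scale])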
    def __init__(self,
                 num_actions,
                 state_bounds,
                 tile_coding={
                     'maxSize': 1024,
                     'num_tilings': 8,
                     'num_grids': 10
                 },
                 learning_rate_w=0.01,
                 learning_rate_theta=0.01,
                 lambda_w=0.8,
                 lambda_theta=0.8,
                 discount_factor=0.9,
                 train_result_file=None):
        self.learning_rate_w = learning_rate_w
        self.learning_rate_theta = learning_rate_theta
        self.discount_factor = discount_factor
        self.lambda_w = lambda_w
        self.lambda_theta = lambda_theta

        self.nA = num_actions
        # for using tile coding to construct the feature vector
        self.tile_coding = tile_coding
        self.maxSize = tile_coding['maxSize']
        self.iht = IHT(self.maxSize)  #initialize the hash table
        self.num_tilings = tile_coding['num_tilings']
        self.tile_scale = tile_coding['num_grids'] / (state_bounds[1] -
                                                      state_bounds[0])
        self.feature_vec_zero = np.zeros(self.maxSize)

        if not train_result_file:
            self.w = np.zeros(self.maxSize)  # parameter vector for the value estimator (critic)
            self.theta = np.zeros(self.maxSize)  # parameter vector for the policy (actor)
            self.e_w = np.zeros(self.maxSize)  # eligibility trace vector for w
            self.e_theta = np.zeros(self.maxSize)  # eligibility trace vector for theta
        else:
            hf = h5py.File(train_result_file, 'r')
            trained_model = hf.get('trained_model')
            # Dataset.value was removed in h5py >= 3.0; index with [()] instead
            self.w = trained_model.get('w')[()]
            self.theta = trained_model.get('theta')[()]
            self.e_w = trained_model.get('e_w')[()]
            self.e_theta = trained_model.get('e_theta')[()]
            hf.close()
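A hedged sketch of how these vectors typically interact in a one-step actor-critic(lambda) update; the feature vectors x_s and x_s_next, the score vector grad_log_pi, and the method name itself are placeholders assumed for this sketch, not part of the original class.

    def actor_critic_update(self, x_s, x_s_next, reward, grad_log_pi, done=False):
        # TD error from the critic's linear value estimate v(s) = w . x(s)
        v_s = np.dot(self.w, x_s)
        v_next = 0.0 if done else np.dot(self.w, x_s_next)
        delta = reward + self.discount_factor * v_next - v_s
        # decay and accumulate the eligibility traces
        self.e_w = self.discount_factor * self.lambda_w * self.e_w + x_s
        self.e_theta = self.discount_factor * self.lambda_theta * self.e_theta + grad_log_pi
        # move the critic (w) and the actor (theta) along their traces
        self.w += self.learning_rate_w * delta * self.e_w
        self.theta += self.learning_rate_theta * delta * self.e_theta
        return delta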
Example #13
    def reset(self):
        # set up tiles to extract a discrete feature representation of the continuous mdp state and discrete action
        self.iht = IHT(self.max_size)
        self.state_scale_factor = [
            self.num_tiles / abs(self.mdp.x_max - self.mdp.x_min),
            self.num_tiles / abs(self.mdp.xdot_max - self.mdp.xdot_min)
        ]

        # setup a tiles fn which returns a list of the active tiles given the state, action pair
        self.tiles = lambda state, action: tiles(
            self.iht, self.num_tilings,
            [state[0] * self.state_scale_factor[0],
             state[1] * self.state_scale_factor[1]], [action])

        # setup weight vector for linear function approximation
        self.w = np.zeros(self.max_size)
        self.total_rewards = 0
Example #14
File: agent.py  Project: Rosevear/RL
def agent_init():
    global weights, iht

    if AGENT == "STATE_AGG":
        weights = np.array([0.0 for weight in range(1, NUM_STATES, AGGREGATE_SIZE)])
    elif AGENT == "TABULAR":
        weights = np.array([0.0 for weight in range(NUM_STATES)])
    elif AGENT == "POLYNOMIAL":
        weights = np.array([0.0 for weight in range(POLY_DEGREE + 1)])
    elif AGENT == "RADIAL":
        weights = np.array([0.0])
    elif AGENT == "TILE_CODING":
        iht = IHT(IHT_SIZE)
        weights = np.array([0.0 for weight in range(IHT_SIZE)])
    else:
        exit("Invalid agent selection!")

    weights = weights[np.newaxis, :]
Example #15
    def __init__(self,
                 mcar,
                 replacing=True,
                 clear_trace=False,
                 true_update=False,
                 num_tiling=8,
                 max_size=4096):
        self.num_tiling = num_tiling
        self.replacing = replacing
        self.clear_trace = clear_trace
        self.true_update = true_update

        self.iht = IHT(max_size)
        self.weights = np.zeros(max_size, dtype=float)  # np.float alias was removed in NumPy 1.24

        self.x_scale = self.num_tiling / (mcar.x_max - mcar.x_min)
        self.v_scale = self.num_tiling / (mcar.v_max - mcar.v_min)

        self.e_trace = np.zeros_like(self.weights)
Example #16
    def __init__(self, environment=gym.make('MountainCar-v0'), num_of_tiles=8):
        self.env = environment
        self.state = self.env.reset()
        self.state_low_bound = self.env.observation_space.low
        self.state_high_bound = self.env.observation_space.high
        self.n_action = self.env.action_space.n

        self.action_space = gym.spaces.Discrete(self.n_action)

        self.num_of_tiles = num_of_tiles
        self.d = 4096
        self.w = np.zeros(self.d)

        self.hash_table = IHT(self.d)
        self.s0_scale = 1.0 * self.d / (self.state_high_bound[0] - self.state_low_bound[0])
        self.s1_scale = 1.0 * self.d / (self.state_high_bound[1] - self.state_low_bound[1])
Example #17
    def agent_init(self):
        """
        Arguments: Nothing
        Returns: Nothing
        
        """
        # init the number of tilings to 50
        self.num_tiling = 50
        self.total_state = 1000
        self.gamma = 1

        self.tile_width = 0.2
        # with a tile width of 0.2 (of the scaled state range), 5 tiles cover the
        # 1000 states; one extra tile allows for offsetting, so
        # max_size = (5 + 1) * num_tilings = 300
        self.max_size = int(((1 / self.tile_width) + 1) * self.num_tiling)

        # init index hash table
        self.iht = IHT(self.max_size)

        # define alpha
        self.alpha = 0.01 / self.num_tiling

        # init weight
        self.weight = np.zeros(self.max_size)

        self.action = None

        # init a variable to keep the last state
        self.last_state = None

        # state feature matrix: one row per state, each row holding the binary
        # feature vector obtained by tile coding
        self.x_s = np.zeros((self.total_state, self.max_size))

        # estimated state-value function
        self.v_hat = None

        # dict tracking which states have already been tile-coded
        self.track = {}
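A hedged sketch (not in the original) of how self.x_s would typically be filled, one row of binary tile-coded features per state; the s / 200 scaling equals s / (total_state * tile_width) and matches the setup above.

    def _build_features(self):
        # hypothetical helper: tile-code every state of the 1000-state random walk
        for s in range(self.total_state):
            active = tiles(self.iht, self.num_tiling, [s / 200.0])
            self.x_s[s][active] = 1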
Example #18
 def agent_init(self):
     """
     Arguments: Nothing
     Returns: Nothing
     Hint: Initialize the variables that need to be reset before each run
     begins
     """
     if self.mode == "tabular":
         self.feature_vector = np.identity(1001)
         self.w = np.zeros(1001)
         self.alpha = 0.5
     elif self.mode == "tile":
         iht = IHT(1024)
         self.num_tiles = 50
         self.feature_vector = np.zeros((1001, 1024))
         self.w = np.zeros(1024)
         self.alpha = 0.01 / 50
         for s in range(1, 1001):
             tile_result = tiles(iht, self.num_tiles, [s / 200])
             self.feature_vector[s][tile_result] = 1
Example #19
 def __init__(self):
     '''
     This is the initialization of the agent class.
     '''
     self.maxSize = 65536
     # self.z = [0]*self.maxSize 
     self.z = np.zeros(self.maxSize)
     self.iht = IHT(self.maxSize)
     # self.weights = [0]*self.maxSize 
     # self.weights = np.random.uniform(-1.0,1.0,self.maxSize)
     self.weights = np.ones(self.maxSize)
     self.numTilings = 8 
     self.stepsize = 0.1/self.numTilings
     self._gamma = 0.9
     self._lambda = 0.2
     self._epsilon = 0.05 
     self._last_action = 0
     # action refactor
     self._current_action = 0 # this gets called and updated in agent._policy()
     self._previous_action = 0 # this gets called and updated in agent.learn()
     #
     self.last_state = [0,0]
Example #20
 def __init__(self):
     self.weights = np.random.uniform(-0.001, 0.001, TILES)
     self.iht = IHT(TILES)
     self.q = dict()
     self.features = dict()
Example #21
import matplotlib as mpl
mpl.use('agg')
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import sys, time
import gym
from gym.envs.registration import register
from tiles3 import IHT, tiles

register(
    id='MountainCar-v1',
    entry_point='gym.envs.classic_control:MountainCarEnv',
    max_episode_steps=5000,
    reward_threshold=-110.0,
)
env = gym.make('MountainCar-v1')
nA = 3
iht = IHT(4096)


def _greedy(Q, s):
    qmax = np.max(Q(s))
    actions = []
    for i, q in enumerate(Q(s)):
        if q == qmax:
            actions.append(i)
    return actions


def greedy(Q, s):
    return np.random.choice(_greedy(Q, s))
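The greedy helpers above assume Q is a callable that maps a state to a vector of action values. A hedged sketch of such a Q built on the iht declared above; the weight vector w and the factory name make_Q are assumptions of this sketch.

def make_Q(w, num_tilings=8):
    # returns Q(s) -> list of approximate action values, one per action
    def Q(s):
        x, xdot = s
        values = []
        for a in range(nA):
            active = tiles(iht, num_tilings,
                           [8 * x / (0.5 + 1.2), 8 * xdot / (0.07 + 0.07)], [a])
            values.append(np.sum(w[active]))
        return values
    return Q

With, say, w = np.zeros(4096), greedy(make_Q(w), s) picks uniformly among the initially tied actions.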

import gym
import numpy as np
import matplotlib.pyplot as plt
import math
from tiles3 import IHT, tiles

# Episodic Semi-gradient Sarsa Implementation (on-policy control)
NUM_TILES = 16
MAX_SIZE = 2**16

iht = IHT(MAX_SIZE)


def get_epsilon(episode, num_episodes):
    if (episode < 1000):
        return 1.0 - (episode) / 1000.0
    else:
        return 0.05


def q(hashtable, state, action, weights):
    active_tiles = tiles(hashtable, NUM_TILES, state, [action])
    return sum(weights[active_tiles])


def epsilon_greedy_action(hashtable, weights, state, e):
    if (np.random.uniform(0, 1) < e):
        action = env.action_space.sample()
    else:
        # act greedily with respect to the tile-coded action values
        state_action_values = [q(hashtable, state, a, weights) for a in [0, 1, 2, 3]]
        action = int(np.argmax(state_action_values))
    return action
    num_tilings = 16
    alpha = (1 / num_tilings)  #recommended alpha by Rich Sutton
    tiling_number = 10  # tiles per dimension in each tiling, so the total is tiling_number^obs_dim * num_tilings

    project_models = []
    tilings = []  #List of tilings, each for one dimension
    weights = []  #List of weights, one for each dimension

    scalers = []

    for k in range(observations_dim):  #let us build all the requisite weights

        project_models.append(
            load('./PCA Models/pca_pipeline_{}.joblib'.format(k + 1)))
        tilings.append(IHT(num_tilings * np.power(
            tiling_number, k + 1)))  # separate tile coding for each dimension

        weights.append(
            np.zeros(
                [num_tilings * np.power(tiling_number, k + 1), num_actions]))

    stepcount = np.zeros([num_episodes, 1])
    gamma = 0.99  #discount factor
    epsilon = 0.1  #set exploration parameter as desired

    def compute_tile_indices(state, dimension):

        c = tiles(tilings[dimension], num_tilings,
                  tiling_number *
                  state.flatten())  #use the appropriate tiling as desired
        return c
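A hedged sketch of how the per-dimension weights above could be combined into action values; compute_q_values is a hypothetical helper (an assumption of this sketch), and it relies on num_actions and observations_dim being defined in the enclosing function as the snippet implies.

    def compute_q_values(projected_states):
        # sum each dimension's linear contribution to get Q(s, .) over all actions
        q = np.zeros(num_actions)
        for k in range(observations_dim):
            idx = compute_tile_indices(projected_states[k], k)
            q += weights[k][idx].sum(axis=0)
        return q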
 def __init__(self, iht_size=IHT_DEFAULT_SIZE, num_tilings=NUM_TILINGS):
     self.iht = IHT(iht_size)
     self.num_tilings = num_tilings
"""
  Author: Adam White, Matthew Schlegel, Mohammad M. Ajallooeian, Sina Ghiassian
  Purpose: Skeleton code for On-Policy Sarsa Control Agent
           for use on A4 of Reinforcement learning course University of Alberta Fall 2017
 
"""

from tiles3 import tiles, IHT
from utils import rand_in_range, rand_un
import numpy as np
import pickle
import decimal
import random

maxSize = 1000 * 50  #big size for the hash vector (states*tilings)
iht = IHT(maxSize)
weights = None
numTilings = 50  #number of tilings
stepSize = 0.01 / numTilings  #alpha
feature_vectors = None
last_state = None  #last state
tile_width = 0.2  #tile width
v = None  #estimates


# return the active tile indices corresponding to a state (x)
def mytiles(x):
    global numTilings, tile_width
    return tiles(iht, numTilings, [x * tile_width])
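A hedged sketch (not part of the skeleton) of how mytiles feeds a linear value estimate and its semi-gradient update; it assumes weights has been initialized to a NumPy array of length maxSize before these are called.

def v_hat(x):
    # linear value estimate: sum of the weights at the active tile indices
    return np.sum(weights[mytiles(x)])


def update(x, target):
    # semi-gradient step toward a target (e.g. a Monte Carlo return)
    weights[mytiles(x)] += stepSize * (target - v_hat(x))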

def agent_init():
    global w, iht
    w = np.zeros(total_states)
    iht = IHT(total_states)
Example #27
 def __init__(self, t_num=4096):
     self.aSpace = [0, 1, 2]
     self.w = np.zeros(t_num)
     self.env = gym.make('MountainCar-v0')
     self.env._max_episode_steps = 500
     self.iht = IHT(t_num)
Example #28
def agent_init():
    global actions, iht, weights
    # choose number of action
    actions = ['left', 'right']
    iht = IHT(MAX_SIZE)
    weights = np.zeros(MAX_SIZE)
Example #29
from rl_glue import *  # Required for RL-Glue

RLGlue("mountaincar", "sarsa_lambda_agent")

import numpy as np
from tiles3 import tiles, IHT

memorySize = 4096
num_tilings = 8
alpha = 0.1 / (num_tilings)
lamb = 0.9
epsilon = 0.0
w = []
z = []
discount = 1
iht = IHT(memorySize)


def my_tiles(state, action):
    (x, xdot) = state
    return tiles(iht, num_tilings,
                 [8.0 * x / (0.5 + 1.2), 8.0 * xdot / (0.07 + 0.07)], [action])


def q_hat(indices, w):
    value = 0
    for t in indices:
        value += w[t]
    return value
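A hedged sketch of an epsilon-greedy policy built on my_tiles and q_hat above; the three-action MountainCar action set is an assumption of this sketch.

def choose_action(state, w):
    # explore with probability epsilon, otherwise act greedily on the tile-coded values
    if np.random.random() < epsilon:
        return np.random.randint(3)
    values = [q_hat(my_tiles(state, a), w) for a in range(3)]
    return int(np.argmax(values))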

Example #30
class Agent:

    # Set up parameters for agent
    # State-action pair values
    # Use a function defined below for Q instead of a dict
    #Q = None
    #R_bar = 0

    # Action space, to be configured by the environment
    A = None

    # Agent configuration variables
    epsilon = 0.05

    # Index hash table / number of features
    hash_table_size = 2048
    # number of tilings / number of feature fields
    num_offset_tilings = 8
    # length of one side of tiling
    #tiling_side_length = np.sqrt(hash_table_size)
    tiling_side_length = 8

    # weight vector
    w = None
    iht = IHT(hash_table_size)

    # eligibility trace vector (length = hash_table_size), reset in agent_start
    z = None

    # step size
    alpha = 0.005 / num_offset_tilings
    gamma = 0.9
    lam = 0.9

    tilecache = {}

    # min and max values for environment
    mins = []
    maxs = []

    last_action = None
    last_state = None

    times_selected = {}

    # returns list of feature indices for the given state
    # [feature# for field 1, feature# for field2, ..., feature# for last field]
    # This is a list of all the features with value 1 (activated)
    def active_features(self, state, action):
        sap = (state, action)
        if (sap in self.tilecache):
            return self.tilecache[sap]
        else:
            scaleFactor = [
                self.tiling_side_length / (self.maxs[i] - self.mins[i])
                for i in range(len(self.maxs))
            ]
            # Use distinct integer actions in tiling, resulting in a different
            # tiling for each action
            t = tiles(self.iht, self.num_offset_tilings,
                      [state[i] * scaleFactor[i] for i in range(len(state))],
                      [action])
            self.tilecache[sap] = t
            return t

    # Declare agent variables
    def __init__(self, actions, max_state, min_state):
        self.R_bar = 0
        self.A = actions
        self.mins = np.array(min_state)
        self.maxs = np.array(max_state)

    # Initialize agent variables
    # Run once, in experiments
    def agent_init(self):
        # Assume there are no states (new states can be valued lazily)
        self.pi = {}
        self.returns = {}
        self.v = {}
        self.last_action = None
        self.last_state = None
        # optimistic initial weights
        self.w = np.full(self.hash_table_size, 0.1)

    # Start agent
    # Runs at the beginning of an episode. The first method called when the experiment
    # starts, called after the environment starts
    # Args:
    #	state (state observation): The agent's current state
    # returns:
    #	The first action the agent takes
    def agent_start(self, state):
        self.tilecache = {}
        self.times_selected = {a: 0 for a in self.A}
        self.z = np.zeros(self.hash_table_size)

        # start episode
        self.last_state = state
        self.last_action = self.epGreedy(state)

        return self.last_action

    # A step taken by the agent
    # Args:
    #	reward (float): the reward received for taking the last action taken
    #	state (state observation): The agent's current state
    # returns:
    #	The action the agent is taking
    def agent_step(self, reward, state):
        # S[t+1]
        Sprime = state
        # R[t+1]
        R = reward
        # S[t]
        S = self.last_state
        # A[t]
        A = self.last_action
        # w[t], self.w = w[t+1]
        w = self.w

        #print(S,A)
        #input(self.active_features(S,A))
        #print("reward", R)
        #print("old", self.Q(S,A,w))

        # accumulating traces: increment the trace for the features active in (S, A)
        # (replacing traces would set self.z[...] = 1 instead)
        self.z[self.active_features(S, A)] += 1

        # A[t+1]
        Aprime = self.epGreedy(Sprime)

        self.times_selected[A] += 1
        #print(self.Q(Sprime,Aprime,self.w))
        #print(self.Q(S,A,self.w))

        old = self.Q(S, A, self.w)

        # TD error: R + gamma * Q(S', A') - Q(S, A)
        error = R + self.gamma * self.Q(Sprime, Aprime, self.w) - self.Q(S, A, self.w)

        adjusted_error2 = error
        #alpha = 0.005/self.times_selected[A]
        alpha = self.alpha
        # elementwise: w[i] += alpha * error * z[i]; then z[i] *= gamma * lambda
        self.w += alpha * error * self.z
        self.z *= self.gamma * self.lam

        #print("new", self.Q(S,A,w))
        new = self.Q(S, A, w)
        #print("S",S, "A", A,"R", R)
        #print("qd", self.Q_dict())
        #input()
        #if (new < old):
        if (1 < 0):
            print("####")
            print("old", old)
            print("new", new)
            print("reward", R)
            print("features", self.active_features(S, A))
            print("old weights", oldweights)
            print("new weights", [w[i] for i in self.active_features(S, A)])
            print("adjusted error", adjusted_error)
            print("adjusted error 2", adjusted_error2)
            print("####")

            input()

        self.last_state = Sprime
        self.last_action = Aprime
        return self.last_action

    # Run when the agent terminates
    # Args:
    #	reward (float): the reward the agent received for entering the terminal state
    def agent_end(self, reward):
        #print('terminal')
        R = reward
        S = self.last_state
        A = self.last_action
        w = self.w

        self.times_selected[A] += 1
        #alpha = 0.005/self.times_selected[A]
        alpha = self.alpha
        #print("reward", R)
        #print("old", self.Q(S,A,w))

        # error[t]
        features = self.active_features(S, A)
        error = R - np.sum(w[features])
        self.z[features] += 1

        #for feature in self.active_features(S,A):
        #	error -= w[feature]
        # accumulating traces
        #	self.z[feature] += 1
        # replacing traces
        #self.z[feature] = 1

        #input(str(R) + " " + str(error))

        #for i in range(len(self.w)):
        #	self.w[i] += alpha*error*self.z[i]
        self.w += alpha * error * self.z

        #print("new", self.Q(S,A,w))
        #input()
        #print("tS",S, "A", A,"R", R)
        #print("error", error)
        #print("alpha", alpha)
        #print("qd", self.Q_dict())
        #input()

    # Receive a message from RLGlue
    # Args:
    #	Message (str): the message passed
    # returns:
    #	response (str): The agent's response to the message (optional)
    def agent_message(self, message):
        # Output what the agent thinks is the optimal policy
        if message == "action-values":
            return None

    def pi_select(self, pi, s, A):
        rand = np.random.random()
        probs = self.pi.get(s, {a: 1 / len(A) for a in A})
        val = 0
        for a in A:
            val += probs[a]
            if rand < val:
                return a

    def epGreedy(self, S):
        rand = np.random.random()

        if (rand < self.epsilon):
            return np.random.choice(self.A)
        else:
            maxQ = max([self.Q(S, a, self.w) for a in self.A])
            max_actions = [a for a in self.A if self.Q(S, a, self.w) >= maxQ]
            return np.random.choice(max_actions)

    def Q(self, state, action, w):
        features = self.active_features(state, action)
        return np.sum(w[features])
        #return sum([w[i] for i in features])

    # returns the action with the highest action-value
    # Args:
    #	Q: Action values
    #	s: state to evaluate
    #	A: Actions to pick from
    def argmaxa(self, Q, s, A):

        # Find the highest action value
        maxQ = max([Q.get((s, a), 0) for a in A])
        # Find actions with highest value
        best_actions = [a for a in A if Q.get((s, a), 0) == maxQ]
        # consistently pick from best actions
        return sorted(best_actions)[0]

    # returns the action with the highest action-value
    # Args:
    #	Q: Action values
    #	s: state to evaluate
    #	A: Actions to pick from
    def argmaxa_rand(self, Q, s, A):
        # Find the highest action value
        maxQ = max([Q.get((s, a), 0) for a in A])
        # Find actions with highest value
        best_actions = [a for a in A if Q.get((s, a), 0) == maxQ]
        # Randomly pick from actions with top action value
        return np.random.choice(best_actions)