def __init__(self, transitions, reward, discount, eps=0.01,
             n_iter=10000, skip_check=False):
    # Initialise a Q-learning MDP.

    # The following check won't be done in MDP()'s initialisation, so let's
    # do it here
    self.max_iter = int(n_iter)
    assert self.max_iter >= 10000, "'n_iter' should be at least 10000."

    if not skip_check:
        # We don't want to send this to MDP because _computePR should not
        # be run on it, so check that it defines an MDP
        _util.check(transitions, reward)

    # Store P, S, and A
    self.S, self.A = _computeDimensions(transitions)
    self.P = self._computeTransition(transitions)
    self.R = reward

    self.discount = discount
    self.eps = eps

    # Initialisations
    self.Q = _np.zeros((self.S, self.A))
    self.mean_discrepancy = []
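# --- Usage sketch (illustrative) -----------------------------------------
# A minimal way to exercise the constructor above, assuming it belongs to a
# pymdptoolbox-style QLearning class; the class name and the use of
# mdptoolbox.example.forest() are assumptions, not part of the snippet.
import mdptoolbox.example

P, R = mdptoolbox.example.forest()      # small 3-state, 2-action toy MDP
ql = QLearning(P, R, discount=0.96)     # eps and n_iter keep their defaults
# At this point ql.Q is an (S, A) array of zeros and ql.mean_discrepancy is
# an empty list, ready for the learning loop to fill in.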
def __init__(self, transitions, reward, discount, start_state, goal_states,
             n_iter=10000, num_restarts=100, timeout_iters=1000):
    # Initialise a Q-learning MDP.

    # The following check won't be done in MDP()'s initialisation, so let's
    # do it here
    self.max_iter = int(n_iter)
    assert self.max_iter >= 10000, "'n_iter' should be at least 10000."

    # We don't want to send this to MDP because _computePR should not be
    # run on it, so check that it defines an MDP
    _util.check(transitions, reward)

    # Store P, S, and A
    self.S, self.A = _computeDimensions(transitions)
    self.P = self._computeTransition(transitions)
    self.R = reward

    self.discount = discount
    self.start_state = start_state
    self.goal_states = goal_states
    self.num_restarts = num_restarts
    self.timeout_iters = timeout_iters

    # Initialisations
    self.Q = _np.zeros((self.S, self.A))
    self.mean_discrepancy = []
def __init__(self, transitions, reward, grid, start, goals, n_restarts=1000,
             alpha=0.2, gamma=0.9, rar=0.9, radr=0.99, n_iter=100000):
    # Initialise a Q-learning MDP.

    # The following check won't be done in MDP()'s initialisation, so let's
    # do it here
    self.max_iter = int(n_iter)
    assert self.max_iter >= 10000, "'n_iter' should be at least 10000."

    # We don't want to send this to MDP because _computePR should not be
    # run on it, so check that it defines an MDP
    _util.check(transitions, reward)

    # Store P, S, and A
    self.S, self.A = _computeDimensions(transitions)
    self.P = self._computeTransition(transitions)
    self.R = reward

    # Q-learning hyperparameters: learning rate, discount, random action
    # rate and its decay factor
    self.alpha = alpha
    self.gamma = gamma
    self.rar = rar
    self.orig_rar = rar
    self.radr = radr
    self.start = start
    self.goals = goals
    self.n_restarts = n_restarts

    # Initialisations
    self.Q = np.random.uniform(-1, 1, (self.S, self.A))
    self.tracker = np.zeros(grid.shape)
    self.ncols = grid.shape[1]
    self.mean_discrepancy = []
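# --- Usage sketch (illustrative) -----------------------------------------
# A minimal instantiation of the grid-based Q-learner above. The 2x2 grid,
# the dummy (A, S, S) transition and (S, A) reward arrays, and the
# GridQLearning class name are all assumptions made for illustration.
import numpy as np

grid = np.zeros((2, 2))                 # 4 states laid out on a 2x2 grid
S, A = grid.size, 4                     # 4 moves: up, down, left, right
P = np.tile(np.eye(S), (A, 1, 1))       # dummy deterministic transitions
R = np.zeros((S, A))                    # dummy rewards ...
R[3, :] = 1.0                           # ... except the goal state pays 1

learner = GridQLearning(P, R, grid, start=0, goals=[3])
# learner.Q starts from small random values and learner.tracker counts
# visits to each grid cell as episodes are run.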
def __init__(self, transitions, reward, discount, epsilon, max_iter,
             skip_check=True, sparse=False):
    # Initialise an MDP based on the input parameters.

    self.sparse = sparse

    # if the discount is None then the algorithm is assumed to not use it
    # in its computations
    if discount is not None:
        self.discount = float(discount)
        assert 0.0 < self.discount <= 1.0, (
            "Discount rate must be in ]0; 1]"
        )
        if self.discount == 1:
            print("WARNING: check conditions of convergence. With no "
                  "discount, convergence can not be assumed.")

    # if the max_iter is None then the algorithm is assumed to not use it
    # in its computations
    if max_iter is not None:
        self.max_iter = int(max_iter)
        assert self.max_iter > 0, (
            "The maximum number of iterations must be greater than 0."
        )

    # check that epsilon is something sane
    if epsilon is not None:
        self.epsilon = float(epsilon)
        assert self.epsilon > 0, "Epsilon must be greater than 0."

    # this will fail for the Kronecker representation right now - but we
    # can write a new check function to make sure dimensions match
    if not skip_check:
        # We run a check on P and R to make sure they are describing an
        # MDP. If an exception isn't raised then they are assumed to be
        # correct.
        _util.check(transitions, reward)

    self.A = transitions.shape[0]
    print("There are", self.A, "actions")
    self.P = self._computeTransition(transitions)
    self.S = self.P[0].N
    print("The joint state space is size", self.S)
    self.R = self._computeReward(reward, transitions)

    # the verbosity is by default turned off
    self.verbose = False
    # Initially the time taken to perform the computations is set to None
    self.time = None
    # set the initial iteration count to zero
    self.iter = 0
    # V should be stored as a vector ie shape of (S,) or (1, S)
    self.V = None
    # policy can also be stored as a vector
    self.policy = None
def __init__(self, transitions, reward, discount, epsilon, max_iter,
             skip_check=False):
    # Initialise an MDP based on the input parameters.

    # if the discount is None then the algorithm is assumed to not use it
    # in its computations
    if discount is not None:
        self.discount = float(discount)
        assert 0.0 < self.discount <= 1.0, (
            "Discount rate must be in ]0; 1]"
        )
        if self.discount == 1:
            print("WARNING: check conditions of convergence. With no "
                  "discount, convergence can not be assumed.")

    # if the max_iter is None then the algorithm is assumed to not use it
    # in its computations
    if max_iter is not None:
        self.max_iter = int(max_iter)
        assert self.max_iter > 0, (
            "The maximum number of iterations must be greater than 0."
        )

    # check that epsilon is something sane
    if epsilon is not None:
        self.epsilon = float(epsilon)
        assert self.epsilon > 0, "Epsilon must be greater than 0."

    if not skip_check:
        # We run a check on P and R to make sure they are describing an
        # MDP. If an exception isn't raised then they are assumed to be
        # correct.
        _util.check(transitions, reward)

    self.S, self.A = _computeDimensions(transitions)
    self.P = self._computeTransition(transitions)
    self.R = self._computeReward(reward, transitions)

    # the verbosity is by default turned off
    self.verbose = False
    # Initially the time taken to perform the computations is set to None
    self.time = None
    # set the initial iteration count to zero
    self.iter = 0
    # V should be stored as a vector ie shape of (S,) or (1, S)
    self.V = None
    # policy can also be stored as a vector
    self.policy = None
def __init__(self, transitions, reward, grid, start, goals, n_restarts=1000,
             alpha=0.2, gamma=0.9, rar=0.9, radr=0.99, n_iter=100000):
    # Initialise a Q-learning MDP.

    # The following check won't be done in MDP()'s initialisation, so let's
    # do it here
    self.max_iter = int(n_iter)
    # assert self.max_iter >= 10000, "'n_iter' should be greater than 10000."

    # We don't want to send this to MDP because _computePR should not be
    # run on it, so check that it defines an MDP
    _util.check(transitions, reward)

    # Store P, S, and A
    self.S, self.A = _computeDimensions(transitions)
    self.P = self._computeTransition(transitions)
    self.R = reward

    self.alpha = alpha
    self.gamma = gamma
    self.rar = rar
    self.orig_rar = rar
    self.radr = radr
    self.start = start
    self.goals = goals
    self.n_restarts = n_restarts

    # Initialisations
    self.Q = np.random.uniform(-1, 1, (self.S, self.A))
    self.tracker = np.zeros(grid.shape)
    self.ncols = grid.shape[1]
    self.mean_discrepancy = []
def __init__(self, transitions, reward, discount, epsilon, max_iter):
    # Initialise an MDP based on the input parameters.

    # if the discount is None then the algorithm is assumed to not use it
    # in its computations
    if discount is not None:
        self.discount = float(discount)
        assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
        if self.discount == 1:
            print("WARNING: check conditions of convergence. With no "
                  "discount, convergence can not be assumed.")

    # if the max_iter is None then the algorithm is assumed to not use it
    # in its computations
    if max_iter is not None:
        self.max_iter = int(max_iter)
        assert self.max_iter > 0, ("The maximum number of iterations "
                                   "must be greater than 0.")

    # check that epsilon is something sane
    if epsilon is not None:
        self.epsilon = float(epsilon)
        assert self.epsilon > 0, "Epsilon must be greater than 0."

    # we run a check on P and R to make sure they are describing an MDP.
    # If an exception isn't raised then they are assumed to be correct.
    _util.check(transitions, reward)

    self.S, self.A = _computeDimensions(transitions)
    self.P = self._computeTransition(transitions)
    self.R = self._computeReward(reward, transitions)

    # the verbosity is by default turned off
    self.verbose = False
    # Initially the time taken to perform the computations is set to None
    self.time = None
    # set the initial iteration count to zero
    self.iter = 0
    # V should be stored as a vector ie shape of (S,) or (1, S)
    self.V = None
    # policy can also be stored as a vector
    self.policy = None
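# --- Subclassing sketch (illustrative) ------------------------------------
# One way a solver could delegate its argument handling to the base
# constructor above; the ValueIterationSketch name and the epsilon/max_iter
# defaults are assumptions, not taken from the snippet.
class ValueIterationSketch(MDP):
    def __init__(self, transitions, reward, discount,
                 epsilon=0.01, max_iter=1000):
        # Validates discount, epsilon and max_iter, then builds P, R, S, A.
        MDP.__init__(self, transitions, reward, discount, epsilon, max_iter)
        # Start from a zero value function; a run() method would then apply
        # Bellman backups until the update's span drops below epsilon.
        self.V = _np.zeros(self.S)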