Example #1
    def __init__(self,
                 transitions,
                 reward,
                 discount,
                 eps=0.01,
                 n_iter=10000,
                 skip_check=False):
        # Initialise a Q-learning MDP.

        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        self.max_iter = int(n_iter)
        assert self.max_iter >= 10000, "'n_iter' should be at least 10000."

        if not skip_check:
            # We don't want to send this to MDP because _computePR should not
            #  be run on it, so check that it defines an MDP
            _util.check(transitions, reward)

        # Store P, S, and A
        self.S, self.A = _computeDimensions(transitions)
        self.P = self._computeTransition(transitions)

        self.R = reward

        self.discount = discount
        self.eps = eps

        # Initialisations
        self.Q = _np.zeros((self.S, self.A))
        self.mean_discrepancy = []
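
A minimal usage sketch for a constructor like this one. It assumes the class is named QLearning and that P and R come from mdptoolbox.example.forest(); the class name and import layout are assumptions, while forest() is a standard pymdptoolbox helper that returns an (A, S, S) transition array and an (S, A) reward matrix accepted by _util.check().

    import mdptoolbox.example

    # forest() with default arguments gives 3 states and 2 actions.
    P, R = mdptoolbox.example.forest()

    # Hypothetical class built around the __init__ above.
    ql = QLearning(P, R, discount=0.96, eps=0.01, n_iter=10000)
    print(ql.S, ql.A)    # 3 2
    print(ql.Q.shape)    # (3, 2) -- the Q-table starts as all zeros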
Example #2
    def __init__(self,
                 transitions,
                 reward,
                 discount,
                 start_state,
                 goal_states,
                 n_iter=10000,
                 num_restarts=100,
                 timeout_iters=1000):
        # Initialise a Q-learning MDP.

        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        self.max_iter = int(n_iter)
        assert self.max_iter >= 10000, "'n_iter' should be at least 10000."

        # We don't want to send this to MDP because _computePR should not be
        # run on it, so check that it defines an MDP
        _util.check(transitions, reward)

        # Store P, S, and A
        self.S, self.A = _computeDimensions(transitions)
        self.P = self._computeTransition(transitions)

        self.R = reward

        self.discount = discount

        self.start_state = start_state
        self.goal_states = goal_states
        self.num_restarts = num_restarts
        self.timeout_iters = timeout_iters

        # Initialisations
        self.Q = _np.zeros((self.S, self.A))
        self.mean_discrepancy = []
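
A hedged instantiation sketch for this episodic variant, reusing the P and R built above; the state indices and the class name EpisodicQLearning are purely illustrative.

    # Hypothetical episodic task: start in state 0, terminate in state 2.
    agent = EpisodicQLearning(P, R, discount=0.95,
                              start_state=0, goal_states=[2],
                              n_iter=10000, num_restarts=100,
                              timeout_iters=1000)
    # num_restarts episodes presumably begin at start_state, each capped at
    # timeout_iters steps; that loop lives outside this __init__.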
Example #3
    def __init__(self,
                 transitions,
                 reward,
                 grid,
                 start,
                 goals,
                 n_restarts=1000,
                 alpha=0.2,
                 gamma=0.9,
                 rar=0.9,
                 radr=0.99,
                 n_iter=100000):
        # Initialise a Q-learning MDP.

        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        self.max_iter = int(n_iter)
        assert self.max_iter >= 10000, "'n_iter' should be at least 10000."

        # We don't want to send this to MDP because _computePR should not be
        # run on it, so check that it defines an MDP
        _util.check(transitions, reward)

        # Store P, S, and A
        self.S, self.A = _computeDimensions(transitions)
        self.P = self._computeTransition(transitions)
        
        self.R = reward
        self.alpha = alpha
        self.gamma = gamma
        self.rar = rar
        self.orig_rar = rar
        self.radr = radr
        self.start = start
        self.goals = goals
        self.n_restarts = n_restarts
        
        # Initialisations
        self.Q = np.random.uniform(-1, 1, (self.S, self.A))
        self.tracker = np.zeros(grid.shape)
        self.ncols = grid.shape[1]
        self.mean_discrepancy = []
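
rar and radr follow the common random-action-rate convention: rar is the probability of taking a random exploratory action and radr is the multiplicative decay applied after every update. The decay itself is not part of this __init__, so the sketch below is only an assumption about how these two fields are typically used:

    import numpy as np

    rar, radr = 0.9, 0.99
    rates = []
    for step in range(5):
        rates.append(rar)   # probability of exploring at this step
        rar *= radr         # decay the exploration rate after every update
    print(rates)            # [0.9, 0.891, 0.88209, ...]
    # After n updates the rate is 0.9 * 0.99**n, so with n_iter=100000 the
    # agent is effectively greedy long before the run finishes.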
Example #4
    def __init__(self, transitions, reward, discount, epsilon, max_iter,
                 skip_check=True, sparse=False):
        # Initialise a MDP based on the input parameters.
        self.sparse = sparse

        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if discount is not None:
            self.discount = float(discount)
            assert 0.0 < self.discount <= 1.0, (
                "Discount rate must be in ]0; 1]"
            )
            if self.discount == 1:
                print("WARNING: check conditions of convergence. With no "
                      "discount, convergence can not be assumed.")

        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if max_iter is not None:
            self.max_iter = int(max_iter)
            assert self.max_iter > 0, (
                "The maximum number of iterations must be greater than 0."
            )

        # check that epsilon is something sane
        if epsilon is not None:
            self.epsilon = float(epsilon)
            assert self.epsilon > 0, "Epsilon must be greater than 0."

        # this will fail for Kronecker representation right now - but we can
        # write a new check function to make sure dimensions match
        if not skip_check:
            # We run a check on P and R to make sure they are describing an
            # MDP. If an exception isn't raised then they are assumed to be
            # correct.
            _util.check(transitions, reward)

        self.A = transitions.shape[0]
        print("There are",self.A,"actions")
        self.P = self._computeTransition(transitions)
        self.S = self.P[0].N
        print("The joint state space is size", self.S)
        self.R = self._computeReward(reward, transitions)

        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
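
This constructor differs from the others: the action count comes from transitions.shape[0] and the joint state-space size from self.P[0].N, which implies each per-action transition is an object exposing an N attribute (consistent with the Kronecker comment above) rather than a plain (S, S) matrix. The wrapper below is hypothetical and only illustrates the attribute this code relies on:

    import numpy as np

    class KronTransition:
        """Hypothetical per-action transition stored as Kronecker factors."""
        def __init__(self, factors):
            self.factors = list(factors)
            # Size of the joint state space, without building the full matrix.
            self.N = int(np.prod([f.shape[0] for f in self.factors]))

        def dense(self):
            # Materialise the joint (N, N) matrix only when really needed.
            out = np.array([[1.0]])
            for f in self.factors:
                out = np.kron(out, f)
            return out

With transitions given as a sequence of such objects, self.S = self.P[0].N reports the joint size without ever forming the dense product.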
Example #5
    def __init__(self, transitions, reward, discount, epsilon, max_iter,
                 skip_check=False):
        # Initialise a MDP based on the input parameters.

        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if discount is not None:
            self.discount = float(discount)
            assert 0.0 < self.discount <= 1.0, (
                "Discount rate must be in ]0; 1]"
            )
            if self.discount == 1:
                print("WARNING: check conditions of convergence. With no "
                      "discount, convergence can not be assumed.")

        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if max_iter is not None:
            self.max_iter = int(max_iter)
            assert self.max_iter > 0, (
                "The maximum number of iterations must be greater than 0."
            )

        # check that epsilon is something sane
        if epsilon is not None:
            self.epsilon = float(epsilon)
            assert self.epsilon > 0, "Epsilon must be greater than 0."

        if not skip_check:
            # We run a check on P and R to make sure they are describing an
            # MDP. If an exception isn't raised then they are assumed to be
            # correct.
            _util.check(transitions, reward)

        self.S, self.A = _computeDimensions(transitions)
        self.P = self._computeTransition(transitions)
        self.R = self._computeReward(reward, transitions)

        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None
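
In pymdptoolbox this style of __init__ belongs to the MDP base class, and concrete solvers call it from their own constructors. A sketch of that subclassing pattern, assuming the method above sits on a class named MDP and that numpy is imported as _np; the solver class itself is hypothetical.

    class ValueIterationLike(MDP):
        """Hypothetical solver that reuses the base initialisation above."""
        def __init__(self, transitions, reward, discount,
                     epsilon=0.01, max_iter=1000, skip_check=False):
            # The base __init__ validates discount/epsilon/max_iter and
            # builds self.P, self.R, self.S and self.A.
            MDP.__init__(self, transitions, reward, discount,
                         epsilon, max_iter, skip_check)
            self.V = _np.zeros(self.S)  # start from an all-zero value function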
Example #6
    def __init__(self,
                 transitions,
                 reward,
                 grid,
                 start,
                 goals,
                 n_restarts=1000,
                 alpha=0.2,
                 gamma=0.9,
                 rar=0.9,
                 radr=0.99,
                 n_iter=100000):
        # Initialise a Q-learning MDP.

        # The following check won't be done in MDP()'s initialisation, so let's
        # do it here
        self.max_iter = int(n_iter)
        #assert self.max_iter >= 10000, "'n_iter' should be greater than 10000."

        # We don't want to send this to MDP because _computePR should not be
        # run on it, so check that it defines an MDP
        _util.check(transitions, reward)

        # Store P, S, and A
        self.S, self.A = _computeDimensions(transitions)
        self.P = self._computeTransition(transitions)

        self.R = reward
        self.alpha = alpha
        self.gamma = gamma
        self.rar = rar
        self.orig_rar = rar
        self.radr = radr
        self.start = start
        self.goals = goals
        self.n_restarts = n_restarts

        # Initialisations
        self.Q = np.random.uniform(-1, 1, (self.S, self.A))
        self.tracker = np.zeros(grid.shape)
        self.ncols = grid.shape[1]
        self.mean_discrepancy = []
Example #7
    def __init__(self, transitions, reward, discount, epsilon, max_iter):
        # Initialise a MDP based on the input parameters.

        # if the discount is None then the algorithm is assumed to not use it
        # in its computations
        if discount is not None:
            self.discount = float(discount)
            assert 0.0 < self.discount <= 1.0, "Discount rate must be in ]0; 1]"
            if self.discount == 1:
                print("WARNING: check conditions of convergence. With no "
                      "discount, convergence can not be assumed.")

        # if the max_iter is None then the algorithm is assumed to not use it
        # in its computations
        if max_iter is not None:
            self.max_iter = int(max_iter)
            assert self.max_iter > 0, "The maximum number of iterations " \
                                      "must be greater than 0."

        # check that epsilon is something sane
        if epsilon is not None:
            self.epsilon = float(epsilon)
            assert self.epsilon > 0, "Epsilon must be greater than 0."

        # we run a check on P and R to make sure they are describing an MDP. If
        # an exception isn't raised then they are assumed to be correct.
        _util.check(transitions, reward)
        self.S, self.A = _computeDimensions(transitions)
        self.P = self._computeTransition(transitions)
        self.R = self._computeReward(reward, transitions)

        # the verbosity is by default turned off
        self.verbose = False
        # Initially the time taken to perform the computations is set to None
        self.time = None
        # set the initial iteration count to zero
        self.iter = 0
        # V should be stored as a vector ie shape of (S,) or (1, S)
        self.V = None
        # policy can also be stored as a vector
        self.policy = None