def __init__(self, mdp_info, policy, learning_rate, off_policy=False,
             beta=None, delta=None):
    """
    Constructor.

    Args:
        off_policy (bool, False): whether to use the off policy setting or
            the online one;
        beta (Parameter, None): beta coefficient;
        delta (Parameter, None): delta coefficient.

    """
    self.off_policy = off_policy
    if delta is not None and beta is None:
        self.delta = delta
        self.beta = None
    elif delta is None and beta is not None:
        self.delta = None
        self.beta = beta
    else:
        raise ValueError('delta or beta parameters needed.')

    self.Q = Table(mdp_info.size)
    self.Q_tilde = Table(mdp_info.size)
    self.R_tilde = Table(mdp_info.size)

    super().__init__(mdp_info, policy, self.Q, learning_rate)
def test_boltzmann():
    np.random.seed(88)
    beta = Parameter(0.1)
    pi = Boltzmann(beta)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)
    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

    p_s = pi(s)
    p_s_test = np.array([0.30676679, 0.36223227, 0.33100094])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.36223227])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 2
    assert a.item() == a_test

    beta_2 = LinearParameter(0.2, 0.1, 2)
    pi.set_beta(beta_2)
    p_sa_2 = pi(s, a)
    assert p_sa_2 < p_sa

    pi.update(s, a)
    p_sa_3 = pi(s, a)
    p_sa_3_test = np.array([0.33100094])
    assert np.allclose(p_sa_3, p_sa_3_test)
def __init__(self, value, exponential=False, min_value=None, tol=1.,
             window=100, size=(1,)):
    """
    Constructor.

    Args:
        tol (float): value of the variance of the target variable such
            that the parameter value is 0.5;
        window (int): number of most recent samples of the target
            variable kept per entry to estimate its variance.

    """
    self._exponential = exponential
    self._tol = tol
    self._weights_var = Table(size)
    self._samples = Table(size + (window,))
    self._index = Table(size, dtype=int)
    self._window = window
    self._parameter_value = Table(size)

    self._add_save_attr(
        _exponential='primitive',
        _tol='primitive',
        _weights_var='mushroom',
        _samples='mushroom',
        _index='mushroom',
        _window='primitive',
        _parameter_value='mushroom'
    )

    super(WindowedVarianceParameter, self).__init__(value, min_value, size)
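# A standalone sketch of the circular-buffer bookkeeping implied by
# `_samples` and `_index` above (illustrative numpy, not the library
# classes): each entry keeps the last `window` target samples, and new
# samples overwrite the oldest ones, so the variance is always estimated
# on the most recent window.
import numpy as np

window = 5
samples = np.zeros(window)
for index, target in enumerate([1., 2., 3., 4., 5., 6., 7.]):
    samples[index % window] = target

assert np.allclose(samples, [6., 7., 3., 4., 5.])
print(samples.var())  # variance of the last `window` samples kept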
def test_eps_greedy():
    np.random.seed(88)
    eps = Parameter(0.1)
    pi = EpsGreedy(eps)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)
    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

    p_s = pi(s)
    p_s_test = np.array([0.03333333, 0.93333333, 0.03333333])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.93333333])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 1
    assert a.item() == a_test

    eps_2 = LinearParameter(0.2, 0.1, 2)
    pi.set_epsilon(eps_2)
    p_sa_2 = pi(s, a)
    assert p_sa_2 < p_sa

    # After two updates the linear schedule reaches its threshold of 0.1,
    # so the probabilities match the original eps = 0.1 ones again.
    pi.update(s, a)
    pi.update(s, a)
    p_sa_3 = pi(s, a)
    assert p_sa_3 == p_sa
def __init__(self, value, exponential=False, min_value=None, tol=1.,
             size=(1,)):
    """
    Constructor.

    Args:
        tol (float): value of the variance of the target variable such
            that the parameter value is 0.5.

    """
    self._exponential = exponential
    self._tol = tol
    self._weights_var = Table(size)
    self._x = Table(size)
    self._x2 = Table(size)
    self._parameter_value = Table(size)

    super().__init__(value, min_value, size)

    self._add_save_attr(
        _exponential='primitive',
        _tol='primitive',
        _weights_var='mushroom',
        _x='mushroom',
        _x2='mushroom',
        _parameter_value='mushroom'
    )
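# A standalone illustration of the running-sum bookkeeping implied by `_x`
# and `_x2` above (not the library's update rule): keeping the sum of the
# samples and the sum of their squares is enough to recover the sample
# variance without storing the samples themselves.
import numpy as np

targets = np.random.randn(1000)
n = len(targets)
x = targets.sum()          # what _x accumulates, per table entry
x2 = (targets ** 2).sum()  # what _x2 accumulates, per table entry
variance = x2 / n - (x / n) ** 2

assert np.isclose(variance, targets.var())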
def __init__(self, value, min_value=None, max_value=None, size=(1,)):
    """
    Constructor.

    Args:
        value (float): initial value of the parameter;
        min_value (float, None): minimum value that the parameter can
            reach when decreasing;
        max_value (float, None): maximum value that the parameter can
            reach when increasing;
        size (tuple, (1,)): shape of the matrix of parameters; this shape
            can be used to have a single parameter for each state or
            state-action tuple.

    """
    self._initial_value = value
    self._min_value = min_value
    self._max_value = max_value
    self._n_updates = Table(size)

    self._add_save_attr(
        _initial_value='primitive',
        _min_value='primitive',
        _max_value='primitive',
        _n_updates='mushroom'
    )
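# A hedged usage sketch of the `size` argument (the import path varies
# across library versions and is assumed here): a scalar parameter shares
# a single decay counter, while a table-shaped one keeps an independent
# counter per state-action pair, matching the `_n_updates` table above.
# LinearParameter is used positionally as in the tests in this section.
from mushroom_rl.utils.parameters import LinearParameter

eps = LinearParameter(1., 0.1, 100)                    # one shared schedule
eps_sa = LinearParameter(1., 0.1, 100, size=(10, 3))   # one per (s, a) pair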
def __init__(self, mdp_info, policy, learning_rate):
    Q = Table(mdp_info.size)
    self.old_q = deepcopy(Q)

    self._add_save_attr(old_q='mushroom')

    super().__init__(mdp_info, policy, Q, learning_rate)
def test_td_policy():
    Q = Table((10, 3))
    pi = TDPolicy()
    pi.set_q(Q)

    assert Q == pi.get_q()
def __init__(self, mdp_info, policy, learning_rate):
    self.Q = Table(mdp_info.size)
    self.old_q = deepcopy(self.Q)

    self._add_save_attr(Q='pickle', old_q='pickle')

    super().__init__(mdp_info, policy, self.Q, learning_rate)
def __init__(self, mdp_info, policy, learning_rate, sampling=True,
             precision=1000):
    """
    Constructor.

    Args:
        sampling (bool, True): use the approximated version to speed up
            the computation;
        precision (int, 1000): number of samples to use in the
            approximated version.

    """
    self.Q = Table(mdp_info.size)
    self._sampling = sampling
    self._precision = precision

    self._add_save_attr(
        Q='pickle',
        _sampling='numpy',
        _precision='numpy',
        _n_updates='pickle',
        _sigma='pickle',
        _Q='pickle',
        _Q2='pickle',
        _weights_var='pickle',
        _w='numpy'
    )

    super().__init__(mdp_info, policy, self.Q, learning_rate)

    self._n_updates = Table(mdp_info.size)
    self._sigma = Table(mdp_info.size, initial_value=1e10)
    self._Q = Table(mdp_info.size)
    self._Q2 = Table(mdp_info.size)
    self._weights_var = Table(mdp_info.size)
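# A standalone sketch of the approximation that `sampling` and `precision`
# control (illustrative, not the agent's code): the weight of each action,
# i.e. the probability that its Q-value estimate is the maximal one, is
# approximated by drawing `precision` samples from per-action Gaussians
# instead of computing the integral in closed form.
import numpy as np

means = np.array([1.0, 1.2, 0.8])   # Q-value estimates for three actions
sigmas = np.array([0.5, 0.7, 0.3])  # their estimated standard deviations
precision = 1000

draws = np.random.normal(means, sigmas, size=(precision, means.size))
w = np.bincount(draws.argmax(axis=1), minlength=means.size) / precision
print(w)  # estimated probability that each action is the maximizer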
def test_mellowmax():
    np.random.seed(88)
    omega = Parameter(3)
    pi = Mellowmax(omega)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)
    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

    p_s = pi(s)
    p_s_test = np.array([0.08540336, 0.69215916, 0.22243748])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.69215916])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 2
    assert a.item() == a_test

    # Setting beta or updating the policy must raise a RuntimeError;
    # reaching the else branch means no exception was raised.
    try:
        beta = Parameter(0.1)
        pi.set_beta(beta)
    except RuntimeError:
        pass
    else:
        assert False

    try:
        pi.update(s, a)
    except RuntimeError:
        pass
    else:
        assert False
def __init__(self, mdp_info, policy, learning_rate, beta):
    """
    Constructor.

    Args:
        beta (Parameter): beta coefficient.

    """
    self.Q = Table(mdp_info.size)
    self._rho = 0.
    self.beta = beta

    super().__init__(mdp_info, policy, self.Q, learning_rate)
def __init__(self, mdp_info, policy, learning_rate, off_policy=False,
             beta=None, delta=None):
    """
    Constructor.

    Args:
        off_policy (bool, False): whether to use the off policy setting or
            the online one;
        beta ([float, Parameter], None): beta coefficient;
        delta ([float, Parameter], None): delta coefficient.

    """
    self.off_policy = off_policy
    if delta is not None and beta is None:
        self.delta = to_parameter(delta)
        self.beta = None
    elif delta is None and beta is not None:
        self.delta = None
        self.beta = to_parameter(beta)
    else:
        raise ValueError('delta or beta parameters needed.')

    Q = Table(mdp_info.size)
    self.Q_tilde = Table(mdp_info.size)
    self.R_tilde = Table(mdp_info.size)

    self._add_save_attr(
        off_policy='primitive',
        delta='mushroom',
        beta='mushroom',
        Q_tilde='mushroom',
        R_tilde='mushroom'
    )

    super().__init__(mdp_info, policy, Q, learning_rate)
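# The branching above enforces an exclusive-or contract on `beta` and
# `delta`. A minimal standalone sketch of that validation (hypothetical
# helper, not part of the library):
def check_beta_delta(beta=None, delta=None):
    if delta is not None and beta is None:
        return beta, delta
    elif delta is None and beta is not None:
        return beta, delta
    else:
        raise ValueError('delta or beta parameters needed.')

assert check_beta_delta(beta=0.5) == (0.5, None)
assert check_beta_delta(delta=0.5) == (None, 0.5)
for kwargs in ({}, dict(beta=0.5, delta=0.5)):
    try:
        check_beta_delta(**kwargs)
        assert False
    except ValueError:
        pass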
def __init__(self, mdp_info, policy, learning_rate, beta):
    """
    Constructor.

    Args:
        beta (Parameter): beta coefficient.

    """
    Q = Table(mdp_info.size)
    self._rho = 0.
    self.beta = beta

    self._add_save_attr(_rho='primitive', beta='pickle')

    super().__init__(mdp_info, policy, Q, learning_rate)
def __init__(self, mdp_info, policy, learning_rate, beta):
    """
    Constructor.

    Args:
        beta ((float, Parameter)): beta coefficient.

    """
    Q = Table(mdp_info.size)
    self._rho = 0.
    self._beta = to_parameter(beta)

    self._add_save_attr(_rho='primitive', _beta='mushroom')

    super().__init__(mdp_info, policy, Q, learning_rate)
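# A hedged sketch of what `to_parameter` does for the constructors above
# (import path assumed; behavior as commonly implemented): a plain float
# is wrapped into a constant Parameter, while an existing Parameter is
# passed through unchanged, so both call styles work.
from mushroom_rl.utils.parameters import Parameter, to_parameter

beta_from_float = to_parameter(0.5)             # wrapped into a Parameter
beta_from_param = to_parameter(Parameter(0.5))  # returned as-is
assert beta_from_float.get_value() == beta_from_param.get_value() == 0.5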
def __init__(self, mdp_info, policy, learning_rate, lambda_coeff,
             trace='replacing'):
    """
    Constructor.

    Args:
        lambda_coeff (float): eligibility trace coefficient;
        trace (str, 'replacing'): type of eligibility trace to use.

    """
    self.Q = Table(mdp_info.size)
    self._lambda = lambda_coeff

    self.e = EligibilityTrace(self.Q.shape, trace)

    super().__init__(mdp_info, policy, self.Q, learning_rate)
def __init__(self, mdp_info, policy, learning_rate, lambda_coeff,
             trace='replacing'):
    """
    Constructor.

    Args:
        lambda_coeff ((float, Parameter)): eligibility trace coefficient;
        trace (str, 'replacing'): type of eligibility trace to use.

    """
    Q = Table(mdp_info.size)
    self._lambda = to_parameter(lambda_coeff)

    self.e = EligibilityTrace(Q.shape, trace)

    self._add_save_attr(
        _lambda='mushroom',
        e='mushroom'
    )

    super().__init__(mdp_info, policy, Q, learning_rate)
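# A standalone numpy sketch of the two classical trace types that the
# `trace` argument selects (illustrative, not the EligibilityTrace
# classes): both decay every entry at each step, but on a visit an
# accumulating trace increments the entry while a replacing trace resets
# it to 1.
import numpy as np

e = np.zeros((10, 3))                # same shape as the Q table
s, a = 2, 1                          # visited state-action pair
gamma, lambda_coeff = 0.99, 0.9

e *= gamma * lambda_coeff            # decay step
e[s, a] += 1.                        # 'accumulating' trace update
# e[s, a] = 1.                       # 'replacing' trace resets instead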
def __init__(self, mdp_info, policy, learning_rate, lambda_coef,
             trace='replacing'):
    """
    Constructor.

    Args:
        lambda_coef (float): eligibility trace coefficient;
        trace (str, 'replacing'): type of eligibility trace to use.

    """
    Q = Table(mdp_info.size)
    self._lambda = lambda_coef

    self.e = EligibilityTrace(Q.shape, trace)

    self._add_save_attr(
        _lambda='primitive',
        e='pickle'
    )

    super().__init__(mdp_info, policy, Q, learning_rate)
def __init__(self, mdp_info, policy, learning_rate):
    Q = Table(mdp_info.size)

    super().__init__(mdp_info, policy, Q, learning_rate)
def __init__(self, mdp_info, policy, learning_rate):
    self.Q = Table(mdp_info.size)
    self.old_q = deepcopy(self.Q)

    super().__init__(mdp_info, policy, self.Q, learning_rate)