def test_deterministic_policy():
    np.random.seed(88)

    n_dims = 5

    approximator = Regressor(LinearApproximator,
                             input_shape=(n_dims,),
                             output_shape=(2,))

    pi = DeterministicPolicy(approximator)

    w_new = np.random.rand(pi.weights_size)
    w_old = pi.get_weights()
    pi.set_weights(w_new)

    assert np.array_equal(w_new, approximator.get_weights())
    assert not np.array_equal(w_old, w_new)
    assert np.array_equal(w_new, pi.get_weights())

    s_test_1 = np.random.randn(5)
    s_test_2 = np.random.randn(5)
    a_test = approximator.predict(s_test_1)

    assert pi.get_regressor() == approximator

    assert pi(s_test_1, a_test) == 1
    assert pi(s_test_2, a_test) == 0

    a_stored = np.array([-1.86941072, -0.1789696])
    assert np.allclose(pi.draw_action(s_test_1), a_stored)
def test_multivariate_state_std_gaussian():
    np.random.seed(88)

    n_dims = 5
    n_outs = 3

    mu_approximator = Regressor(LinearApproximator,
                                input_shape=(n_dims,),
                                output_shape=(n_outs,))
    std_approximator = Regressor(LinearApproximator,
                                 input_shape=(n_dims,),
                                 output_shape=(n_outs,))

    pi = StateStdGaussianPolicy(mu_approximator, std_approximator)

    weights = np.random.rand(pi.weights_size) + .1
    pi.set_weights(weights)

    x = np.random.randn(20, n_dims)
    for x_i in x:
        state = np.atleast_1d(x_i)
        action = pi.draw_action(state)
        exact_diff = pi.diff(state, action)
        numerical_diff = numerical_diff_policy(pi, state, action)

        assert np.allclose(exact_diff, numerical_diff)
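# The gradient tests above and in test_multivariate_gaussian rely on a
# numerical_diff_policy helper from the test utilities that is not shown
# here. A minimal central-difference sketch of what such a helper could look
# like, assuming policy(state, action) returns the density value being
# differentiated, is the following (the real helper may differ):
import numpy as np


def numerical_diff_policy(policy, state, action, eps=1e-6):
    # Perturb each policy weight in turn and approximate the derivative of
    # policy(state, action) with a central difference.
    w_start = policy.get_weights().copy()
    diff = np.zeros(policy.weights_size)

    for i in range(policy.weights_size):
        perturb = np.zeros(policy.weights_size)
        perturb[i] = eps

        policy.set_weights(w_start - perturb)
        v1 = policy(state, action)
        policy.set_weights(w_start + perturb)
        v2 = policy(state, action)

        diff[i] = (v2 - v1) / (2 * eps)

    # Restore the original weights so the caller's policy is unchanged.
    policy.set_weights(w_start)

    return diff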
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5.,
                       episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # save normalization / plot states to disk path
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # load states from disk path
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")
def _initialize_regressors(self, approximator, apprx_params_train,
                           apprx_params_target):
    self.approximator = Regressor(approximator, **apprx_params_train)
    self.target_approximator = Regressor(approximator,
                                         n_models=self._n_approximators,
                                         **apprx_params_target)

    for i in range(len(self.target_approximator)):
        self.target_approximator[i].set_weights(
            self.approximator.get_weights())
class AveragedDQN(AbstractDQN):
    """
    Averaged-DQN algorithm.
    "Averaged-DQN: Variance Reduction and Stabilization for Deep Reinforcement
    Learning". Anschel O. et al.. 2017.

    """
    def __init__(self, mdp_info, policy, approximator, n_approximators,
                 **params):
        """
        Constructor.

        Args:
            n_approximators (int): the number of target approximators to
                store.

        """
        assert n_approximators > 1

        self._n_approximators = n_approximators

        super().__init__(mdp_info, policy, approximator, **params)

        self._n_fitted_target_models = 1

        self._add_save_attr(_n_fitted_target_models='primitive')

    def _initialize_regressors(self, approximator, apprx_params_train,
                               apprx_params_target):
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)

        for i in range(len(self.target_approximator)):
            self.target_approximator[i].set_weights(
                self.approximator.get_weights())

    def _update_target(self):
        idx = self._n_updates // self._target_update_frequency \
            % self._n_approximators
        self.target_approximator[idx].set_weights(
            self.approximator.get_weights())

        if self._n_fitted_target_models < self._n_approximators:
            self._n_fitted_target_models += 1

    def _next_q(self, next_state, absorbing):
        q = list()
        for idx in range(self._n_fitted_target_models):
            q.append(self.target_approximator.predict(next_state, idx=idx,
                                                      **self._predict_params))
        q = np.mean(q, axis=0)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)
def _initialize_regressors(self, approximator, apprx_params_train,
                           apprx_params_target):
    self.approximator = Regressor(approximator,
                                  n_models=self._n_approximators,
                                  prediction='min', **apprx_params_train)
    self.target_approximator = Regressor(approximator,
                                         n_models=self._n_approximators,
                                         prediction='min',
                                         **apprx_params_target)
    self._update_target()
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5.,
                       episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        print('Epoch: ', n, ' J: ',
              np.mean(compute_J(dataset, mdp.info.gamma)))

    if save_states_to_disk:
        # save normalization / plot states to disk path
        os.makedirs("./temp/", exist_ok=True)
        prepro.save_state("./temp/normalization_state")
        plotter.save_state("./temp/plotting_state")

        # load states from disk path
        prepro.load_state("./temp/normalization_state")
        plotter.load_state("./temp/plotting_state")
def experiment(alg, params, n_epochs, fit_per_run, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
def test_multivariate_gaussian():
    np.random.seed(88)

    n_dims = 5
    n_outs = 3

    random_matrix = np.random.rand(n_outs, n_outs)
    sigma = random_matrix.dot(random_matrix.T)

    approximator = Regressor(LinearApproximator,
                             input_shape=(n_dims,),
                             output_shape=(n_outs,))

    pi = GaussianPolicy(approximator, sigma)

    mu_weights = np.random.rand(pi.weights_size)
    pi.set_weights(mu_weights)

    x = np.random.randn(20, n_dims)
    for x_i in x:
        state = np.atleast_1d(x_i)
        action = pi.draw_action(state)
        exact_diff = pi.diff(state, action)
        numerical_diff = numerical_diff_policy(pi, state, action)

        assert np.allclose(exact_diff, numerical_diff)
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=2, max_action=1., max_pos=1.)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 0.25 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=1e-2)
    algorithm_params = dict(optimizer=optimizer)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J),
                      policy_weights=policy.get_weights().tolist())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J),
                          policy_weights=policy.get_weights().tolist())
def test_torch_ensemble_logger(tmpdir):
    np.random.seed(1)
    torch.manual_seed(1)

    logger = Logger('ensemble_logger', results_dir=tmpdir,
                    use_timestamp=True)

    approximator = Regressor(TorchApproximator, input_shape=(4,),
                             output_shape=(2,), n_models=3,
                             network=ExampleNet,
                             optimizer={'class': optim.Adam, 'params': {}},
                             loss=F.mse_loss, batch_size=100, quiet=True)
    approximator.set_logger(logger)

    x = np.random.rand(1000, 4)
    y = np.random.rand(1000, 2)

    for i in range(50):
        approximator.fit(x, y)

    loss_file = np.load(logger.path / 'loss.npy')

    assert loss_file.shape == (50, 3)
    assert np.allclose(loss_file[0],
                       np.array([0.29083753, 0.86829887, 1.0505845])) and \
        np.allclose(loss_file[-1],
                    np.array([0.09410495, 0.18786799, 0.15016919]))
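# This test and test_pytorch_approximator below pass an ExampleNet class
# that is not defined in this section. Judging from the gradient checks in
# test_pytorch_approximator (4 inputs, 2 outputs, 10 parameters), it is a
# single linear layer. The following is a hypothetical sketch compatible
# with TorchApproximator's network contract (constructor taking input_shape
# and output_shape, optional action indexing in forward); the definition in
# the actual test suite may differ.
import torch
import torch.nn as nn


class ExampleNet(nn.Module):
    def __init__(self, input_shape, output_shape, **kwargs):
        super().__init__()
        # One Q-value per action, computed by a single linear layer.
        self._q = nn.Linear(input_shape[0], output_shape[0])

    def forward(self, state, action=None):
        q = self._q(state.float())
        if action is None:
            # Return the Q-values of every action.
            return q
        # Return only the Q-value of the selected action.
        return torch.squeeze(q.gather(1, action.long()))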
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles,
                             low=low, high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J))
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles,
                             low=low, high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J),
                      distribution_parameters=distribution.get_parameters())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(
            i + 1, J=np.mean(J),
            distribution_parameters=distribution.get_parameters())
def test_cmac_approximator():
    np.random.seed(1)

    # Generic regressor
    x = np.random.rand(1000, 2)
    k1 = np.random.rand(2)
    k2 = np.random.rand(2)

    y = np.array([np.sin(x.dot(k1) * 2 * np.pi),
                  np.sin(x.dot(k2) * 2 * np.pi)]).T

    tilings = Tiles.generate(10, [10, 10], np.zeros(2), np.ones(2))
    approximator = Regressor(CMAC, tilings=tilings, input_shape=(2,),
                             output_shape=(2,))

    approximator.fit(x, y)

    x = np.random.rand(2, 2)
    y_hat = approximator.predict(x)
    y_true = np.array([np.sin(x.dot(k1) * 2 * np.pi),
                       np.sin(x.dot(k2) * 2 * np.pi)]).T
    y_test = np.array([[-0.73581504, 0.90877225],
                       [-0.95854488, -0.72429239]])

    assert np.allclose(y_hat, y_test)

    point = np.random.rand(2)
    derivative = approximator.diff(point)

    assert np.array_equal(np.sum(derivative, axis=0), np.ones(2) * 10)
    assert len(derivative) == approximator.weights_size

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    # Action regressor + Ensemble
    n_actions = 2
    s = np.random.rand(1000, 3)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    tilings = Tiles.generate(10, [10, 10, 10], np.zeros(3), np.ones(3))
    approximator = Regressor(CMAC, tilings=tilings, input_shape=(3,),
                             n_actions=n_actions, n_models=5)

    approximator.fit(s, a, q)

    np.random.seed(2)
    x_s = np.random.rand(2, 3)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a, prediction='mean')
    y_test = np.array([[0.56235045, 0.25080909]])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='sum')
    y_test = np.array([2.81175226, 1.25404543])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='min')
    y_test = np.array([0.56235045, 0.25080909])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.10367145, 0.56235045], [0.05575822, 0.25080909]])
    assert np.allclose(y, y_test)
def _initialize_regressors(self, approximator, apprx_params_train,
                           apprx_params_target):
    self.approximator = Regressor(approximator, **apprx_params_train)
    self.target_approximator = Regressor(approximator,
                                         **apprx_params_target)
    self._update_target()
class AbstractDQN(Agent):
    def __init__(self, mdp_info, policy, approximator, approximator_params,
                 batch_size, target_update_frequency, replay_memory=None,
                 initial_replay_size=500, max_replay_size=5000,
                 fit_params=None, clip_reward=False):
        """
        Constructor.

        Args:
            approximator (object): the approximator to use to fit the
                Q-function;
            approximator_params (dict): parameters of the approximator to
                build;
            batch_size ((int, Parameter)): the number of samples in a batch;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            replay_memory ([ReplayMemory, PrioritizedReplayMemory], None): the
                object of the replay memory to use; if None, a default replay
                memory is created;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            fit_params (dict, None): parameters of the fitting algorithm of
                the approximator;
            clip_reward (bool, False): whether to clip the reward or not.

        """
        self._fit_params = dict() if fit_params is None else fit_params
        self._batch_size = to_parameter(batch_size)
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency

        if replay_memory is not None:
            self._replay_memory = replay_memory
            if isinstance(replay_memory, PrioritizedReplayMemory):
                self._fit = self._fit_prioritized
            else:
                self._fit = self._fit_standard
        else:
            self._replay_memory = ReplayMemory(initial_replay_size,
                                               max_replay_size)
            self._fit = self._fit_standard

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)

        self._initialize_regressors(approximator, apprx_params_train,
                                    apprx_params_target)
        policy.set_q(self.approximator)

        self._add_save_attr(
            _fit_params='pickle',
            _batch_size='mushroom',
            _n_approximators='primitive',
            _clip_reward='primitive',
            _target_update_frequency='primitive',
            _replay_memory='mushroom',
            _n_updates='primitive',
            approximator='mushroom',
            target_approximator='mushroom'
        )

        super().__init__(mdp_info, policy)

    def fit(self, dataset):
        self._fit(dataset)

        self._n_updates += 1
        if self._n_updates % self._target_update_frequency == 0:
            self._update_target()

    def _fit_standard(self, dataset, approximator=None):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size())

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            if approximator is None:
                self.approximator.fit(state, action, q, **self._fit_params)
            else:
                approximator.fit(state, action, q, **self._fit_params)

    def _fit_prioritized(self, dataset, approximator=None):
        self._replay_memory.add(
            dataset, np.ones(len(dataset)) * self._replay_memory.max_priority)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, idxs, is_weight = \
                self._replay_memory.get(self._batch_size())

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next
            td_error = q - self.approximator.predict(state, action)

            self._replay_memory.update(td_error, idxs)

            if approximator is None:
                self.approximator.fit(state, action, q, weights=is_weight,
                                      **self._fit_params)
            else:
                approximator.fit(state, action, q, weights=is_weight,
                                 **self._fit_params)

    def draw_action(self, state):
        action = super().draw_action(np.array(state))

        return action

    def _initialize_regressors(self, approximator, apprx_params_train,
                               apprx_params_target):
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.set_weights(self.approximator.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        raise NotImplementedError

    def _post_load(self):
        if isinstance(self._replay_memory, PrioritizedReplayMemory):
            self._fit = self._fit_prioritized
        else:
            self._fit = self._fit_standard

        self.policy.set_q(self.approximator)
def test_linear_approximator():
    np.random.seed(1)

    # Generic regressor
    a = np.random.rand(1000, 3)

    k = np.random.rand(3, 2)
    b = a.dot(k) + np.random.randn(1000, 2)

    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             output_shape=(2,))

    approximator.fit(a, b)

    x = np.random.rand(2, 3)
    y = approximator.predict(x)
    y_test = np.array([[0.57638247, 0.1573216], [0.11388247, 0.24123678]])

    assert np.allclose(y, y_test)

    point = np.random.randn(3,)
    derivative = approximator.diff(point)

    lp = len(point)
    for i in range(derivative.shape[1]):
        assert (derivative[i * lp:(i + 1) * lp, i] == point).all()

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    # Action regressor + Ensemble
    n_actions = 2
    s = np.random.rand(1000, 3)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             n_actions=n_actions, n_models=5)

    approximator.fit(s, a, q)

    x_s = np.random.rand(2, 3)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a, prediction='mean')
    y_test = np.array([0.49225698, 0.69660881])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='sum')
    y_test = np.array([2.46128492, 3.48304404])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='min')
    y_test = np.array([[0.49225698, 0.69660881]])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.49225698, 0.44154141], [0.69660881, 0.69060195]])
    assert np.allclose(y, y_test)

    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             n_actions=n_actions)

    approximator.fit(s, a, q)

    gradient = approximator.diff(x_s[0], x_a[0])
    gradient_test = np.array([0.88471362, 0.11666548, 0.45466254, 0., 0., 0.])
    assert np.allclose(gradient, gradient_test)
def __init__(self, mdp_info, policy, approximator, approximator_params,
             batch_size, target_update_frequency, replay_memory=None,
             initial_replay_size=500, max_replay_size=5000, fit_params=None,
             n_approximators=1, clip_reward=True):
    """
    Constructor.

    Args:
        approximator (object): the approximator to use to fit the
            Q-function;
        approximator_params (dict): parameters of the approximator to
            build;
        batch_size (int): the number of samples in a batch;
        target_update_frequency (int): the number of samples collected
            between each update of the target network;
        replay_memory ([ReplayMemory, PrioritizedReplayMemory], None): the
            object of the replay memory to use; if None, a default replay
            memory is created;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        fit_params (dict, None): parameters of the fitting algorithm of
            the approximator;
        n_approximators (int, 1): the number of approximators to use in
            ``AveragedDQN``;
        clip_reward (bool, True): whether to clip the reward or not.

    """
    self._fit_params = dict() if fit_params is None else fit_params

    self._batch_size = batch_size
    self._n_approximators = n_approximators
    self._clip_reward = clip_reward
    self._target_update_frequency = target_update_frequency

    if replay_memory is not None:
        self._replay_memory = replay_memory
        if isinstance(replay_memory, PrioritizedReplayMemory):
            self._fit = self._fit_prioritized
        else:
            self._fit = self._fit_standard
    else:
        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)
        self._fit = self._fit_standard

    self._n_updates = 0

    apprx_params_train = deepcopy(approximator_params)
    apprx_params_target = deepcopy(approximator_params)

    self.approximator = Regressor(approximator, **apprx_params_train)
    self.target_approximator = Regressor(approximator,
                                         n_models=self._n_approximators,
                                         **apprx_params_target)
    policy.set_q(self.approximator)

    if self._n_approximators == 1:
        self.target_approximator.set_weights(
            self.approximator.get_weights())
    else:
        for i in range(self._n_approximators):
            self.target_approximator[i].set_weights(
                self.approximator.get_weights())

    self._add_save_attr(
        _fit_params='pickle',
        _batch_size='primitive',
        _n_approximators='primitive',
        _clip_reward='primitive',
        _target_update_frequency='primitive',
        _replay_memory='mushroom',
        _n_updates='primitive',
        approximator='mushroom',
        target_approximator='mushroom'
    )

    super().__init__(mdp_info, policy)
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al.. 2015.

    """
    def __init__(self, mdp_info, policy, approximator, approximator_params,
                 batch_size, target_update_frequency, replay_memory=None,
                 initial_replay_size=500, max_replay_size=5000,
                 fit_params=None, n_approximators=1, clip_reward=True):
        """
        Constructor.

        Args:
            approximator (object): the approximator to use to fit the
                Q-function;
            approximator_params (dict): parameters of the approximator to
                build;
            batch_size (int): the number of samples in a batch;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            replay_memory ([ReplayMemory, PrioritizedReplayMemory], None): the
                object of the replay memory to use; if None, a default replay
                memory is created;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            fit_params (dict, None): parameters of the fitting algorithm of
                the approximator;
            n_approximators (int, 1): the number of approximators to use in
                ``AveragedDQN``;
            clip_reward (bool, True): whether to clip the reward or not.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency

        if replay_memory is not None:
            self._replay_memory = replay_memory
            if isinstance(replay_memory, PrioritizedReplayMemory):
                self._fit = self._fit_prioritized
            else:
                self._fit = self._fit_standard
        else:
            self._replay_memory = ReplayMemory(initial_replay_size,
                                               max_replay_size)
            self._fit = self._fit_standard

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)

        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.set_weights(
                self.approximator.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator[i].set_weights(
                    self.approximator.get_weights())

        self._add_save_attr(
            _fit_params='pickle',
            _batch_size='primitive',
            _n_approximators='primitive',
            _clip_reward='primitive',
            _target_update_frequency='primitive',
            _replay_memory='mushroom',
            _n_updates='primitive',
            approximator='mushroom',
            target_approximator='mushroom'
        )

        super().__init__(mdp_info, policy)

    def fit(self, dataset):
        self._fit(dataset)

        self._n_updates += 1
        if self._n_updates % self._target_update_frequency == 0:
            self._update_target()

    def _fit_standard(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

    def _fit_prioritized(self, dataset):
        self._replay_memory.add(
            dataset, np.ones(len(dataset)) * self._replay_memory.max_priority)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, idxs, is_weight = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next
            td_error = q - self.approximator.predict(state, action)

            self._replay_memory.update(td_error, idxs)

            self.approximator.fit(state, action, q, weights=is_weight,
                                  **self._fit_params)

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.set_weights(self.approximator.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        action = super(DQN, self).draw_action(np.array(state))

        return action

    def _post_load(self):
        if isinstance(self._replay_memory, PrioritizedReplayMemory):
            self._fit = self._fit_prioritized
        else:
            self._fit = self._fit_standard

        self.policy.set_q(self.approximator)
def test_cmac_approximator():
    np.random.seed(1)

    # Generic regressor
    x = np.random.rand(1000, 2)
    k1 = np.random.rand(2)
    k2 = np.random.rand(2)

    y = np.array([np.sin(x.dot(k1) * 2 * np.pi),
                  np.sin(x.dot(k2) * 2 * np.pi)]).T

    tilings = Tiles.generate(10, [10, 10], np.zeros(2), np.ones(2))
    approximator = Regressor(CMAC, tilings=tilings, input_shape=(2,),
                             output_shape=(2,))

    approximator.fit(x, y)

    x = np.random.rand(2, 2)
    y_hat = approximator.predict(x)
    y_true = np.array([np.sin(x.dot(k1) * 2 * np.pi),
                       np.sin(x.dot(k2) * 2 * np.pi)]).T
    y_test = np.array([[-0.73787754, 0.90673493],
                       [-0.94972964, -0.72380013]])

    assert np.allclose(y_hat, y_test)

    point = np.random.rand(2)
    derivative = approximator.diff(point)

    assert np.array_equal(np.sum(derivative, axis=0), np.ones(2) * 10)
    assert len(derivative) == approximator.weights_size

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    # Action regressor + Ensemble
    n_actions = 2
    s = np.random.rand(1000, 3)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    tilings = Tiles.generate(10, [10, 10, 10], np.zeros(3), np.ones(3))
    approximator = Regressor(CMAC, tilings=tilings, input_shape=(3,),
                             n_actions=n_actions, n_models=5)

    approximator.fit(s, a, q)

    x_s = np.random.rand(2, 3)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a, prediction='mean')
    y_test = np.array([[0.10921918, 0.09923379]])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='sum')
    y_test = np.array([0.54609592, 0.49616895])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='min')
    y_test = np.array([[0.10921918, 0.09923379]])
    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.07606651, 0.10921918], [0.40698114, 0.09923379]])
    assert np.allclose(y, y_test)
def test_pytorch_approximator():
    np.random.seed(1)
    torch.manual_seed(1)

    n_actions = 2
    s = np.random.rand(1000, 4)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    approximator = Regressor(TorchApproximator, input_shape=(4,),
                             output_shape=(2,), n_actions=n_actions,
                             network=ExampleNet,
                             optimizer={'class': optim.Adam, 'params': {}},
                             loss=F.mse_loss, batch_size=100, quiet=True)

    approximator.fit(s, a, q, n_epochs=20)

    x_s = np.random.rand(2, 4)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a)
    y_test = np.array([0.37191153, 0.5920861])

    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.47908658, 0.37191153], [0.5920861, 0.27575058]])

    assert np.allclose(y, y_test)

    gradient = approximator.diff(x_s[0], x_a[0])
    gradient_test = np.array([0., 0., 0., 0., 0.02627479, 0.76513696,
                              0.6672573, 0.35979462, 0., 1.])

    assert np.allclose(gradient, gradient_test)

    gradient = approximator.diff(x_s[0])
    gradient_test = np.array([[0.02627479, 0.], [0.76513696, 0.],
                              [0.6672573, 0.], [0.35979462, 0.],
                              [0., 0.02627479], [0., 0.76513696],
                              [0., 0.6672573], [0., 0.35979462],
                              [1., 0.], [0., 1.]])

    assert np.allclose(gradient, gradient_test)

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))