def test_PGPE():
    distribution = learn(PGPE, optimizer=AdaptiveOptimizer(1.5)).distribution
    w = distribution.get_parameters()
    w_test = np.array([0.02489092, 0.31062211, 0.2051433, 0.05959651,
                       -0.78302236, 0.77381954, 0.23676176, -0.29855654])

    assert np.allclose(w, w_test)
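These test and example excerpts omit their import headers, and the `learn` helper and `tu` test utilities used by the tests live in the test modules and are not shown here. A header that would make the remaining snippets self-contained might look like the following; the module paths are assumed from MushroomRL 1.x and may differ between versions.

# Assumed imports for the snippets in this section; exact module paths
# may differ between MushroomRL versions.
import os
from datetime import datetime

import numpy as np
from tqdm import trange

from mushroom_rl.algorithms.policy_search import (REINFORCE, GPOMDP, eNAC,
                                                  PGPE, REPS, RWR)
from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Agent, Core, Logger
from mushroom_rl.distributions import (GaussianCholeskyDistribution,
                                       GaussianDiagonalDistribution)
from mushroom_rl.environments import LQR
from mushroom_rl.policy import StateStdGaussianPolicy
from mushroom_rl.utils.callbacks import CollectDataset, PlotDataset
from mushroom_rl.utils.dataset import compute_J
from mushroom_rl.utils.optimizers import AdaptiveOptimizer
from mushroom_rl.utils.preprocessors import MinMaxPreprocessor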
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5.,
                       episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # save normalization / plot states to disk path
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # load states from disk path
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")
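A minimal driver for this example could look like the sketch below; the argument values are illustrative and not taken from the original script, which may read them from the command line instead.

if __name__ == '__main__':
    # Illustrative values; the original script may parse these from the CLI.
    experiment(n_epochs=10, n_iterations=10, ep_per_run=100,
               save_states_to_disk=False)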
def test_PGPE_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(
        datetime.now().strftime("%H%M%S%f"))

    agent_save = learn(PGPE, optimizer=AdaptiveOptimizer(1.5))

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5.,
                       episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(learning_rate=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        print('Epoch: ', n, ' J: ',
              np.mean(compute_J(dataset, mdp.info.gamma)))

    if save_states_to_disk:
        # save normalization / plot states to disk path
        os.makedirs("./temp/", exist_ok=True)
        prepro.save_state("./temp/normalization_state")
        plotter.save_state("./temp/plotting_state")

        # load states from disk path
        prepro.load_state("./temp/normalization_state")
        plotter.load_state("./temp/plotting_state")
def test_GPOMDP_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(
        datetime.now().strftime("%H%M%S%f"))

    params = dict(optimizer=AdaptiveOptimizer(eps=.01))

    agent_save = learn(GPOMDP, params)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=2, max_action=1., max_pos=1.)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 0.25 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=1e-2)
    algorithm_params = dict(optimizer=optimizer)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J),
                      policy_weights=policy.get_weights().tolist())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J),
                          policy_weights=policy.get_weights().tolist())
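A sketch of a driver that runs this experiment once per policy gradient algorithm follows; the algorithm list matches the agents used elsewhere in this section, but the epoch and episode counts are assumptions, not values from the original example.

if __name__ == '__main__':
    # Illustrative driver; epoch/episode counts are assumed.
    algs = [REINFORCE, GPOMDP, eNAC]
    for alg in algs:
        experiment(alg, n_epochs=10, n_iterations=4, ep_per_run=100)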
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
    print(alg.__name__)

    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu: ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)


if __name__ == '__main__':
    algs_params = [
        (REPS, {'eps': 0.05}),
        (RWR, {'beta': 0.01}),
        (PGPE, {'optimizer': AdaptiveOptimizer(eps=0.3)}),
    ]
    for alg, params in algs_params:
        experiment(alg, params, n_epochs=20, n_episodes=100, n_ep_per_fit=25)
def test_eNAC():
    params = dict(optimizer=AdaptiveOptimizer(eps=.01))
    policy = learn(eNAC, params).policy
    w = np.array([-0.03668018, 2.05112355])

    assert np.allclose(w, policy.get_weights())
def test_GPOMDP():
    params = dict(optimizer=AdaptiveOptimizer(eps=.01))
    policy = learn(GPOMDP, params).policy
    w = np.array([-0.07623939, 2.05232858])

    assert np.allclose(w, policy.get_weights())
    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J))


if __name__ == '__main__':
    algs_params = [
        (REPS, {'eps': 1.0}),
        (RWR, {'beta': 0.7}),
        (PGPE, {'optimizer': AdaptiveOptimizer(eps=1.5)}),
    ]
    for alg, params in algs_params:
        experiment(alg, params, n_epochs=25, fit_per_epoch=10, ep_per_fit=20)
def test_eNAC():
    params = dict(optimizer=AdaptiveOptimizer(eps=.01))
    policy = learn(eNAC, params).policy
    w = np.array([-0.16169364, 2.00594995])

    assert np.allclose(w, policy.get_weights())
def test_GPOMDP():
    params = dict(optimizer=AdaptiveOptimizer(eps=.01))
    policy = learn(GPOMDP, params).policy
    w = np.array([-0.11457566, 1.99784316])

    assert np.allclose(w, policy.get_weights())
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))


if __name__ == '__main__':
    optimizer = AdaptiveOptimizer(eps=0.05)

    algs = [REPS, RWR, PGPE]
    params = [{'eps': 0.5}, {'beta': 0.7}, {'optimizer': optimizer}]

    for alg, params in zip(algs, params):
        print(alg.__name__)
        experiment(alg, params, n_epochs=4, fit_per_run=10, ep_per_run=100)
def test_REINFORCE():
    params = dict(optimizer=AdaptiveOptimizer(eps=.01))
    policy = learn(REINFORCE, params).policy
    w = np.array([-0.0084793, 2.00536528])

    assert np.allclose(w, policy.get_weights())
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        logger.epoch_info(i + 1, J=np.mean(J),
                          mu=p[:n_weights], sigma=p[n_weights:])

    logger.info('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)


if __name__ == '__main__':
    algs_params = [
        (REPS, {'eps': 0.05}),
        (RWR, {'beta': 0.01}),
        (PGPE, {'optimizer': AdaptiveOptimizer(eps=0.3)}),
    ]
    for alg, params in algs_params:
        experiment(alg, params, n_epochs=20, n_episodes=100, n_ep_per_fit=25)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J))


if __name__ == '__main__':
    algs_params = [
        (REPS, {'eps': 1.0}),
        (RWR, {'beta': 0.7}),
        (PGPE, {'optimizer': AdaptiveOptimizer(eps=1.5)}),
    ]
    for alg, params in algs_params:
        experiment(alg, params, n_epochs=25, fit_per_epoch=10, ep_per_fit=20)