def experiment(alg, params, n_epochs, fit_per_run, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
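# Possible entry point for the black-box-optimization experiment above. This is
# a sketch, not part of the original snippet: it assumes RWR and PGPE are
# importable from mushroom_rl.algorithms.policy_search, that RWR accepts a
# `beta` temperature, that PGPE accepts an `optimizer`, and that
# AdaptiveOptimizer lives in mushroom_rl.utils.optimizers.
if __name__ == '__main__':
    from mushroom_rl.algorithms.policy_search import RWR, PGPE
    from mushroom_rl.utils.optimizers import AdaptiveOptimizer

    algs_params = [
        (RWR, {'beta': 1.}),
        (PGPE, {'optimizer': AdaptiveOptimizer(eps=0.3)})
    ]
    for alg, params in algs_params:
        experiment(alg, params, n_epochs=4, fit_per_run=10, ep_per_run=100)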
def test_V_lqr_gaussian_policy_gradient_K_diff_dims():
    A = np.array([[1., 0.4], [0.2, 0.8]])
    B = np.array([[0.8], [0.5]])
    Q = np.eye(2)
    R = np.eye(1)

    lqr = LQR(A, B, Q, R, max_pos=np.inf, max_action=np.inf,
              random_init=False, episodic=False, gamma=0.9, horizon=100,
              initial_state=None)

    K = np.array([[1.0, 0.1]])
    Sigma = np.array([[0.2]])
    s = np.array([1.0, 1.3])

    dJ = compute_lqr_V_gaussian_policy_gradient_K(s, lqr, K, Sigma)

    f = lambda theta: compute_lqr_V_gaussian_policy(
        s, lqr, theta.reshape(K.shape), Sigma)
    dJ_num = numerical_diff_function(f, K.reshape(-1))

    assert np.allclose(dJ, dJ_num)
def test_lqr_solver_linear():
    lqr = LQR.generate(3)
    K = compute_lqr_feedback_gain(lqr)

    K_test = np.array([[0.89908343, 0., 0.],
                       [0., 0.24025307, 0.],
                       [0., 0., 0.24025307]])

    assert np.allclose(K, K_test)
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # save normalization / plot states to disk path
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # load states from disk path
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")
def test_V_lqr():
    lqr = LQR.generate(3)
    K = np.array([[1.0, 0.1, 0.01],
                  [0.5, 1.2, 0.02],
                  [.02, 0.3, 0.9]])
    s = np.array([1.0, 1.3, -0.3])

    V_lqr = compute_lqr_V(s, lqr, K).item()

    assert np.allclose(V_lqr, -6.3336186348534875)
def test_Q_lqr_gaussian_policy_10dim():
    lqr = LQR.generate(10)
    K = np.eye(10) * 0.1
    Sigma = np.eye(10) * 0.1
    s = np.ones(10)
    a = np.ones(10)

    Q_lqg = compute_lqr_Q_gaussian_policy(s, a, lqr, K, Sigma).item()

    assert np.allclose(Q_lqg, -48.00590405904062)
def test_Q_lqr():
    lqr = LQR.generate(3)
    K = np.array([[1.0, 0.1, 0.01],
                  [0.5, 1.2, 0.02],
                  [.02, 0.3, 0.9]])
    s = np.array([1.0, 1.3, -0.3])
    a = np.array([0.5, 0.2, 0.1])

    Q_lqr = compute_lqr_Q(s, a, lqr, K).item()

    assert np.allclose(Q_lqr, -10.83964921837036)
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        print('Epoch: ', n, ' J: ',
              np.mean(compute_J(dataset, mdp.info.gamma)))

    if save_states_to_disk:
        # save normalization / plot states to disk path
        os.makedirs("./temp/", exist_ok=True)
        prepro.save_state("./temp/normalization_state")
        plotter.save_state("./temp/plotting_state")

        # load states from disk path
        prepro.load_state("./temp/normalization_state")
        plotter.load_state("./temp/plotting_state")
def test_V_lqr_gaussian_policy():
    lqr = LQR.generate(3)
    K = np.array([[1.0, 0.1, 0.01],
                  [0.5, 1.2, 0.02],
                  [.02, 0.3, 0.9]])
    Sigma = np.array([[0.18784063, 0.02205161, 0.19607835],
                      [0.02205161, 0.59897771, 0.09953863],
                      [0.19607835, 0.09953863, 0.23284475]])
    s = np.array([1.0, 1.3, -0.3])

    V_lqg = compute_lqr_V_gaussian_policy(s, lqr, K, Sigma)

    assert np.allclose(V_lqg, -28.39165320182624)
def test_P():
    lqr = LQR.generate(3)
    K = np.array([[1.0, 0.1, 0.01],
                  [0.5, 1.2, 0.02],
                  [.02, 0.3, 0.9]])

    P = compute_lqr_P(lqr, K)
    P_test = np.array([[1.60755632, 0.78058807, 0.03219049],
                       [0.78058807, 1.67738666, 0.24905620],
                       [0.03219049, 0.24905620, 0.83697781]])

    assert np.allclose(P, P_test)
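# Background for test_P (a sketch, not part of the test suite): with the reward
# convention r(s, a) = -(s^T Q s + a^T R a) and the gain convention a = -K s,
# the matrix returned by compute_lqr_P is assumed to satisfy the discounted
# policy-evaluation fixed point
#     P = Q + K^T R K + gamma * (A - B K)^T P (A - B K),
# so that V(s) = -s^T P s. The check below also assumes the LQR environment
# exposes the attributes A, B, Q, R and info.gamma.
def check_P_fixed_point():
    lqr = LQR.generate(3)
    K = np.array([[1.0, 0.1, 0.01],
                  [0.5, 1.2, 0.02],
                  [.02, 0.3, 0.9]])
    P = compute_lqr_P(lqr, K)

    A_c = lqr.A - lqr.B @ K  # closed-loop dynamics under a = -K s
    residual = lqr.Q + K.T @ lqr.R @ K \
        + lqr.info.gamma * A_c.T @ P @ A_c - P

    # expected to be close to zero if the assumptions above hold
    return np.max(np.abs(residual))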
def test_Q_lqr_gaussian_policy():
    lqr = LQR.generate(3)
    K = np.array([[1.0, 0.1, 0.01],
                  [0.5, 1.2, 0.02],
                  [.02, 0.3, 0.9]])
    Sigma = np.array([[0.18784063, 0.02205161, 0.19607835],
                      [0.02205161, 0.59897771, 0.09953863],
                      [0.19607835, 0.09953863, 0.23284475]])
    s = np.array([1.0, 1.3, -0.3])
    a = np.array([-0.5, -0.2, 0.1])

    Q_lqg = compute_lqr_Q_gaussian_policy(s, a, lqr, K, Sigma).item()

    assert np.allclose(Q_lqg, -23.887098201718487)
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=2, max_action=1., max_pos=1.)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 0.25 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=1e-2)
    algorithm_params = dict(optimizer=optimizer)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J),
                      policy_weights=policy.get_weights().tolist())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J),
                          policy_weights=policy.get_weights().tolist())
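# Possible entry point for the policy-gradient experiment above. This is a
# sketch, not part of the original snippet: it assumes REINFORCE, GPOMDP and
# eNAC are importable from mushroom_rl.algorithms.policy_search and that they
# all accept the optimizer-based constructor used in experiment().
if __name__ == '__main__':
    from mushroom_rl.algorithms.policy_search import REINFORCE, GPOMDP, eNAC

    for alg in [REINFORCE, GPOMDP, eNAC]:
        experiment(alg, n_epochs=10, n_iterations=4, ep_per_run=100)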
def test_V_lqr_gaussian_policy_gradient_K():
    lqr = LQR.generate(3)
    K = np.array([[1.0, 0.1, 0.01],
                  [0.5, 1.2, 0.02],
                  [.02, 0.3, 0.9]])
    Sigma = np.array([[0.18784063, 0.02205161, -0.19607835],
                      [0.02205161, 0.59897771, 0.09953863],
                      [-0.19607835, 0.09953863, 0.23284475]])
    s = np.array([1.0, 1.3, -0.3])

    dJ = compute_lqr_V_gaussian_policy_gradient_K(s, lqr, K, Sigma)

    f = lambda theta: compute_lqr_V_gaussian_policy(
        s, lqr, theta.reshape(K.shape), Sigma)
    dJ_num = numerical_diff_function(f, K.reshape(-1))

    assert np.allclose(dJ, dJ_num)
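# The two gradient tests above cross-check the analytic gradient with
# numerical_diff_function. For illustration only (this is not the library's
# implementation), a central-difference approximation of the gradient of a
# scalar function of a flat parameter vector could look like this:
def central_diff_gradient(f, x, eps=1e-6):
    grad = np.zeros_like(x, dtype=float)
    for i in range(x.size):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[i] += eps
        x_minus[i] -= eps
        grad[i] = (f(x_plus) - f(x_minus)) / (2 * eps)
    return grad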
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J),
                      distribution_parameters=distribution.get_parameters())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(
            i + 1, J=np.mean(J),
            distribution_parameters=distribution.get_parameters())
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('policy parameters: ', policy.get_weights())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
def learn(alg, **alg_params):
    np.random.seed(1)
    torch.manual_seed(1)

    # MDP
    mdp = LQR.generate(dimensions=2)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, distribution, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=5, n_episodes_per_fit=5)

    return agent
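# Example use of the learn() helper above. This is a sketch, not part of the
# original snippet: it assumes RWR is importable from
# mushroom_rl.algorithms.policy_search, accepts a `beta` temperature, and that
# the fitted agent exposes its search distribution as agent.distribution.
def test_learn_rwr_runs():
    from mushroom_rl.algorithms.policy_search import RWR

    agent = learn(RWR, beta=1.)

    # After one fit on 5 episodes the distribution parameters should be finite.
    assert np.all(np.isfinite(agent.distribution.get_parameters()))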