def test_sarsa_lambda_continuous_nn_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = SARSALambdaContinuous(mdp_continuous.info, pi,
                                       TorchApproximator, Parameter(.1), .9,
                                       features=features,
                                       approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)

def learn(alg, alg_params):
    mdp = LQR.generate(dimensions=1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy

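# Hypothetical invocation sketch (REINFORCE and the optimizer setting are
# assumptions mirroring the usage elsewhere in this file, not part of the
# original snippet):
policy = learn(REINFORCE, dict(optimizer=AdaptiveOptimizer(eps=.01)))
print('Learned policy weights:', policy.get_weights())
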
def experiment(alg, params, n_epochs, fit_per_run, ep_per_run):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

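# Hypothetical invocation (REPS and its eps hyperparameter are illustrative
# assumptions, not from the original snippet):
experiment(alg=REPS, params=dict(eps=1.), n_epochs=10, fit_per_run=10,
           ep_per_run=100)
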
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(mdp.info, pi, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J start:', J)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J final:', J)

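# Example entry point for the chain experiment above (assumed; the original
# snippet does not show how it is invoked):
if __name__ == '__main__':
    experiment()
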
def learn_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    return agent

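# Example follow-up (assumed): run LSPI and inspect the fitted weights, as the
# companion test further below does.
agent = learn_lspi()
print('LSPI weights:', agent.approximator.get_weights())
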
def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1., exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get()

    return Qs

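# Hypothetical invocation (QLearning and the decay exponent are illustrative
# assumptions):
Qs = experiment(QLearning, .51)
print('Number of collected Q tables:', len(Qs))
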
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)

def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.62627886, 0., -13.03033079, 0., -15.93237930, 0.,
                       -9.72299176, 0., -13.78884631, 0., -9.92157645, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)

def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(mdp_continuous.info, pi, Parameter(.1), .9,
                                  features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.30427303, 0., -13.54157504, 0., -16.82373134, 0.,
                       -10.29613337, 0., -14.79470382, 0., -10.50654665, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)

def learn(alg, alg_params):
    mdp = InvertedPendulum(horizon=50)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return agent

def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(mdp_continuous.info, pi, Parameter(.1), .9,
                                  features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)

def experiment(goal, use_muscles, n_epochs, n_steps, n_episodes_test):
    np.random.seed(1)

    # MDP
    gamma = 0.99
    horizon = 2000
    mdp = create_mdp(gamma, horizon, goal, use_muscles=use_muscles)

    # Agent
    agent = create_SAC_agent(mdp)

    # Normalization callback
    normalizer = MinMaxPreprocessor(mdp_info=mdp.info)

    # Plotting callback
    plotter = PlotDataset(mdp.info)

    # Algorithm (with normalization and plotting)
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[normalizer])

    # Training loop
    for n in range(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=True)
        print('Epoch: ', n, ' J: ', np.mean(compute_J(dataset, gamma)),
              ' Len_ep: ', int(np.round(np.mean(episodes_length(dataset)))))

    print('Press a button to visualize humanoid')
    input()
    core.evaluate(n_episodes=10, render=True)

def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(mdp.info, pi, approximator,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return agent, np.mean(compute_J(dataset, mdp.info.gamma))

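# Hypothetical invocation with FQI (the n_iterations value is an assumption;
# any batch algorithm with the same constructor signature would fit):
agent, mean_J = learn(FQI, dict(n_iterations=5))
print('Mean discounted return:', mean_J)
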
def test_true_online_sarsa_lambda_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                       Parameter(.1), .9, features=features,
                                       approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)

def test_collect_Q():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(mdp.info, pi, alpha)

    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])
    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574, 0.02246188, 1.6210059, 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)

def test_sarsa_lambda_continuous_nn():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-0.18968964, 0.4296857, 0.52967095, 0.5674884,
                       -0.12784956, -0.10572472, -0.14546978, -0.67001086,
                       -0.93925357])

    assert np.allclose(agent.Q.get_weights(), test_w)

def test_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    w = agent.approximator.get_weights()
    w_test = np.array([-1.00749128, -1.13444655, -0.96620322])

    assert np.allclose(w, w_test)

def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialParameter(value=1, exp=.5,
                                   size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1, exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.Q, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get()

    return reward, max_Qs

def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info, policy, mu, alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi, policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)

def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps,
               n_steps_per_fit, n_episodes_test, alg_params, policy_params):
    print(alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    tqdm.write('END OF EPOCH 0')
    tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
    tqdm.write('##################################################################################################')

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        tqdm.write('END OF EPOCH ' + str(it + 1))
        tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
        tqdm.write('##################################################################################################')

    print('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)

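# Hypothetical invocation sketch; every value below, including the
# PPO-specific keywords, is an illustrative assumption rather than a setting
# taken from the original snippet:
experiment(alg=PPO, env_id='Pendulum-v0', horizon=200, gamma=.99,
           n_epochs=10, n_steps=30000, n_steps_per_fit=3000,
           n_episodes_test=5,
           alg_params=dict(actor_optimizer={'class': optim.Adam,
                                            'params': {'lr': 3e-4}},
                           n_epochs_policy=4, batch_size=64, eps_ppo=.2,
                           lam=.95),
           policy_params=dict(std_0=1., n_features=32, use_cuda=False))
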
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # Normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # Plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # Training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # Save normalization / plot states to disk
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # Load states from disk
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")

def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    print('============ start experiment ============')
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = GraspEnv()
    print('============ mdp ============')

    # Policy
    n_weights = 6
    mu = np.array([-0.5, 0.0, 0.91, m.pi, 0, 0])
    sigma = np.asarray([0.05, 0.05, 0.05, 0.1, 0.1, 0.1])
    policy = Own_policy()
    dist = GaussianDiagonalDistribution(mu, sigma)
    agent = alg(mdp.info, dist, policy, **params)

    # Train, collecting the dataset at each fit
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        print('================ core learn ================')
        core.learn(n_episodes=n_episodes, n_episodes_per_fit=n_ep_per_fit)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        print('J:', J)
        print('============================')
        dataset_callback.clean()

        p = dist.get_parameters()
        print('p:', p)
        mu_0.append(p[:n_weights][0])
        mu_1.append(p[:n_weights][1])
        mu_2.append(p[:n_weights][2])
        mu_3.append(p[:n_weights][3])
        mu_4.append(p[:n_weights][4])
        mu_5.append(p[:n_weights][5])
        current_avg_sigma = (p[n_weights:][0] + p[n_weights:][1]
                             + p[n_weights:][2] + p[n_weights:][3]
                             + p[n_weights:][4] + p[n_weights:][5]) / 6
        avg_sigma.append(current_avg_sigma)

        # Record the learning curve of cumulative rewards
        logger.epoch_info(i + 1, J=np.mean(J), mu=p[:n_weights],
                          sigma=p[n_weights:])
        list_J.append(np.mean(J))

def learn(alg, alg_params):
    # MDP
    mdp = CartPole()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(
        network=Network if alg is not CategoricalDQN else FeatureNetwork,
        optimizer={'class': optim.Adam, 'params': {'lr': .001}},
        loss=F.smooth_l1_loss,
        input_shape=input_shape,
        output_shape=mdp.info.action_space.size,
        n_actions=mdp.info.action_space.n,
        n_features=2,
        use_cuda=False
    )

    # Agent
    if alg not in [DuelingDQN, CategoricalDQN]:
        agent = alg(mdp.info, pi, TorchApproximator,
                    approximator_params=approximator_params, **alg_params)
    elif alg is CategoricalDQN:
        agent = alg(mdp.info, pi, approximator_params=approximator_params,
                    n_atoms=2, v_min=-1, v_max=1, **alg_params)
    else:
        agent = alg(mdp.info, pi, approximator_params=approximator_params,
                    **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=500, n_steps_per_fit=5)

    return agent

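# Hypothetical invocation (DQN and the replay/target-update settings are
# assumptions for illustration only):
agent = learn(DQN, dict(batch_size=50, initial_replay_size=50,
                        max_replay_size=500, target_update_frequency=100))
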
def learn(alg):
    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=False)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': .001}}

    critic_input_shape = (
        actor_input_shape[0] + mdp.info.action_space.shape[0],
    )
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': .001}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=False)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params, actor_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, tau)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent.policy

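# Hypothetical invocation (assuming an algorithm such as DDPG whose
# constructor matches the positional arguments used above):
trained_policy = learn(DDPG)
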
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(mdp.info, pi, approximator,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))

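# Example entry point (assumed): the experiment returns the mean discounted
# return over the 17x17 grid of initial states built above.
mean_J = experiment()
print('Mean J over initial states:', mean_J)
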
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps,
               n_steps_per_fit, n_step_test, alg_params, policy_params):
    logger = Logger(A2C.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + A2C.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4, 'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=64,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params['critic_params'] = critic_params

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    logger.epoch_info(0, J=J, R=R, entropy=E)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        logger.epoch_info(it + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)

def test_maxmin_q_learning():
    pi, mdp, _ = initialize()
    agent = MaxminQLearning(mdp.info, pi, Parameter(.5), n_tables=4)

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0., 0., 0., 0.],
                       [0., 7.5, 0., 0.],
                       [0., 0., 0., 5.],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q[0].table, test_q)

def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J))

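# Hypothetical invocation (PGPE and the optimizer are illustrative
# assumptions; any black-box optimizer with this constructor would fit):
experiment(alg=PGPE, params=dict(optimizer=AdaptiveOptimizer(eps=.3)),
           n_epochs=25, fit_per_epoch=10, ep_per_fit=20)
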
def test_rq_learning():
    pi, mdp, _ = initialize()
    agent = RQLearning(mdp.info, pi, Parameter(.1), beta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.32411217, 2.9698436, 0.46474438, 1.10269504],
                       [2.99505139, 5.217031, 0.40933461, 0.37687883],
                       [0.41942675, 0.32363486, 0., 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(mdp.info, pi, Parameter(.1), delta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array(
        [[1.04081115e-2, 5.14662188e-1, 1.73951634e-2, 1.24081875e-01],
         [0., 2.71, 1.73137500e-4, 4.10062500e-6],
         [0., 4.50000000e-2, 0., 4.68559],
         [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(mdp.info, pi, Parameter(.1), off_policy=True,
                       beta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[3.55204022, 4.54235939, 3.42601165, 2.95170908],
                       [2.73877031, 3.439, 2.42031528, 2.86634531],
                       [3.43274708, 3.8592342, 3.72637395, 5.217031],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(mdp.info, pi, Parameter(.1), off_policy=True,
                       delta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.18947806, 1.57782254, 0.21911489, 1.05197011],
                       [0.82309759, 5.217031, 0.04167492, 0.61472604],
                       [0.23620541, 0.59828262, 1.25299991, 5.217031],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # Normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # Plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # Training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        print('Epoch: ', n,
              ' J: ', np.mean(compute_J(dataset, mdp.info.gamma)))

    if save_states_to_disk:
        # Save normalization / plot states to disk
        os.makedirs("./temp/", exist_ok=True)
        prepro.save_state("./temp/normalization_state")
        plotter.save_state("./temp/plotting_state")

        # Load states from disk
        prepro.load_state("./temp/normalization_state")
        plotter.load_state("./temp/plotting_state")
