def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1., exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get()

    return Qs
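# A minimal sketch of how the chain experiment above might be driven;
# QLearning and the decay exponent .51 are illustrative assumptions,
# not values taken from this file.
def run_chain_experiment_sketch():
    from mushroom_rl.algorithms.value import QLearning

    # One Q-table snapshot is collected per learning step
    Qs = experiment(QLearning, .51)

    return Qs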
def test_sarsa_lambda_continuous_nn_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = SARSALambdaContinuous(mdp_continuous.info, pi,
                                       TorchApproximator, Parameter(.1), .9,
                                       features=features,
                                       approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.62627886, 0., -13.03033079, 0., -15.93237930, 0.,
                       -9.72299176, 0., -13.78884631, 0., -9.92157645, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.30427303, 0., -13.54157504, 0., -16.82373134, 0.,
                       -10.29613337, 0., -14.79470382, 0., -10.50654665, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(mdp.info, pi, approximator,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return agent, np.mean(compute_J(dataset, mdp.info.gamma))
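# A minimal sketch of calling the batch-RL helper above; FQI and its
# n_iterations argument follow mushroom_rl's API and are assumptions here,
# not values taken from this file.
def run_fqi_sketch():
    from mushroom_rl.algorithms.value import FQI

    agent, J = learn(FQI, dict(n_iterations=10))

    return J  # mean discounted return over the two evaluation episodes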
def learn(alg, alg_params):
    mdp = LQR.generate(dimensions=1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy
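# A minimal sketch of calling the policy-gradient helper above; REINFORCE with
# an AdaptiveOptimizer is an assumption borrowed from mushroom_rl 1.x (the
# import path may differ across versions), not a value taken from this file.
def run_reinforce_sketch():
    from mushroom_rl.algorithms.policy_search import REINFORCE
    from mushroom_rl.utils.optimizers import AdaptiveOptimizer

    policy = learn(REINFORCE, dict(optimizer=AdaptiveOptimizer(eps=.01)))

    return policy.get_weights()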
def learn_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    return agent
def test_collect_Q():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(mdp.info, pi, alpha)

    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])
    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574, 0.02246188, 1.6210059, 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)
def learn(alg, alg_params):
    mdp = InvertedPendulum(horizon=50)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return agent
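# A minimal sketch of calling the deep actor-critic helper above; PPO and its
# hyperparameters follow mushroom_rl's API and are assumptions, not values
# taken from this file. The helper injects critic_params itself.
def run_ppo_sketch():
    from mushroom_rl.algorithms.actor_critic import PPO

    alg_params = dict(actor_optimizer={'class': optim.Adam,
                                       'params': {'lr': 3e-4}},
                      n_epochs_policy=4, batch_size=64, eps_ppo=.2, lam=.95)

    return learn(PPO, alg_params)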
def test_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    w = agent.approximator.get_weights()
    w_test = np.array([-1.00749128, -1.13444655, -0.96620322])

    assert np.allclose(w, w_test)
def test_true_online_sarsa_lambda_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                       Parameter(.1), .9, features=features,
                                       approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
def test_sarsa_lambda_continuous_nn():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-0.18968964, 0.4296857, 0.52967095, 0.5674884,
                       -0.12784956, -0.10572472, -0.14546978, -0.67001086,
                       -0.93925357])

    assert np.allclose(agent.Q.get_weights(), test_w)
def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialParameter(value=1, exp=.5,
                                   size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1, exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.Q, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get()

    return reward, max_Qs
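# Hypothetical comparison driver for the Van Hasselt grid world above;
# the algorithm pair and the decay exponent .8 are illustrative assumptions,
# not values taken from this file.
def run_grid_world_sketch():
    from mushroom_rl.algorithms.value import QLearning, DoubleQLearning

    results = {}
    for algorithm_class in (QLearning, DoubleQLearning):
        results[algorithm_class.__name__] = experiment(algorithm_class, .8)

    return results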
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # Normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # Plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # Training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # Save normalization and plotting states to disk
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # Load the saved states back from disk
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")
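# A minimal sketch of launching the plotting/normalization experiment above;
# the epoch and episode counts are illustrative assumptions, not values taken
# from this file.
def run_plot_and_norm_sketch():
    experiment(n_epochs=10, n_iterations=4, ep_per_run=25,
               save_states_to_disk=False)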
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = GraspEnv()

    # Policy
    n_weights = 6
    mu = np.array([-0.5, 0.0, 0.91, m.pi, 0, 0])
    sigma = np.asarray([0.05, 0.05, 0.05, 0.1, 0.1, 0.1])
    # Wider exploration is also possible, e.g.
    # np.asarray([0.15, 0.15, 0.15, 0.4, 0.4, 0.4])
    policy = Own_policy()
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train; the collected dataset is used to compute the learning curve
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_episodes_per_fit=n_ep_per_fit)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        # Track the evolution of the distribution parameters
        p = dist.get_parameters()
        mu_0.append(p[:n_weights][0])
        mu_1.append(p[:n_weights][1])
        mu_2.append(p[:n_weights][2])
        mu_3.append(p[:n_weights][3])
        mu_4.append(p[:n_weights][4])
        mu_5.append(p[:n_weights][5])
        current_avg_sigma = (p[n_weights:][0] + p[n_weights:][1] +
                             p[n_weights:][2] + p[n_weights:][3] +
                             p[n_weights:][4] + p[n_weights:][5]) / 6
        avg_sigma.append(current_avg_sigma)

        # Record the learning curve of cumulative rewards
        logger.epoch_info(i + 1, J=np.mean(J), mu=p[:n_weights],
                          sigma=p[n_weights:])
        list_J.append(np.mean(J))
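# A minimal sketch of calling the grasping experiment above with a black-box
# policy-search algorithm; REPS and eps=1. follow mushroom_rl's
# distribution-based API and are assumptions, not values from this file.
def run_grasp_sketch():
    from mushroom_rl.algorithms.policy_search import REPS

    experiment(REPS, dict(eps=1.),
               n_epochs=10, n_episodes=20, n_ep_per_fit=4)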
def learn(alg, alg_params):
    # MDP
    mdp = CartPole()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(
        network=Network if alg is not CategoricalDQN else FeatureNetwork,
        optimizer={'class': optim.Adam, 'params': {'lr': .001}},
        loss=F.smooth_l1_loss,
        input_shape=input_shape,
        output_shape=mdp.info.action_space.size,
        n_actions=mdp.info.action_space.n,
        n_features=2,
        use_cuda=False
    )

    # Agent
    if alg not in [DuelingDQN, CategoricalDQN]:
        agent = alg(mdp.info, pi, TorchApproximator,
                    approximator_params=approximator_params, **alg_params)
    elif alg is CategoricalDQN:
        agent = alg(mdp.info, pi, approximator_params=approximator_params,
                    n_atoms=2, v_min=-1, v_max=1, **alg_params)
    else:
        agent = alg(mdp.info, pi, approximator_params=approximator_params,
                    **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=500, n_steps_per_fit=5)

    return agent
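# A minimal sketch of calling the DQN helper above; the replay and target
# update settings mirror mushroom_rl's DQN constructor and are assumptions,
# not values taken from this file.
def run_dqn_sketch():
    from mushroom_rl.algorithms.value import DQN

    alg_params = dict(batch_size=50, initial_replay_size=50,
                      max_replay_size=500, target_update_frequency=50)

    return learn(DQN, alg_params)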
def show_agent(self, episodes=5, mdp_render=False):
    """
    Run and visualize the best agent in the environment built by the
    stored environment builder.

    """
    matplotlib.use(default_backend)

    mdp = self.logger.load_environment_builder().build()
    if mdp_render:
        mdp.render()

    agent = self.logger.load_best_agent()

    core = Core(agent, mdp)
    core.evaluate(n_episodes=episodes, render=True)
def learn(alg):
    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=False)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': .001}}

    critic_input_shape = (
        actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': .001}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=False)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params, actor_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, tau)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent.policy
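# A minimal sketch of calling the helper above; DDPG matches the positional
# constructor arguments used in learn() and is an assumption, not a value
# taken from this file.
def run_ddpg_sketch():
    from mushroom_rl.algorithms.actor_critic import DDPG

    policy = learn(DDPG)

    return policy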
def test_dataset_utils():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))
    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)
    agent = SARSA(mdp.info, pi, alpha)
    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([1.16106307e-03, 2.78128389e-01, 1.66771817e+00,
                       3.09031544e-01, 1.19725152e-01, 9.84770902e-01,
                       1.06111661e-02, 2.05891132e+00, 2.28767925e+00,
                       4.23911583e-01])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0011610630703530948
    assert max_J == 0.2781283894436937
    assert mean_J == 0.1396447262570234
    assert n_episodes == 2
def test_maxmin_q_learning():
    pi, mdp, _ = initialize()
    agent = MaxminQLearning(mdp.info, pi, Parameter(.5), n_tables=4)

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0., 0., 0., 0.],
                       [0., 7.5, 0., 0.],
                       [0., 0., 0., 5.],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q[0].table, test_q)
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # Normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # Plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # Training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        print('Epoch: ', n, ' J: ', np.mean(compute_J(dataset, mdp.info.gamma)))

    if save_states_to_disk:
        # Save normalization and plotting states to disk
        os.makedirs("./temp/", exist_ok=True)
        prepro.save_state("./temp/normalization_state")
        plotter.save_state("./temp/plotting_state")

        # Load the saved states back from disk
        prepro.load_state("./temp/normalization_state")
        plotter.load_state("./temp/plotting_state")
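# A minimal sketch exercising the save/load branch of the experiment above;
# the small epoch and episode counts are illustrative assumptions.
def run_plot_and_norm_save_sketch():
    experiment(n_epochs=2, n_iterations=2, ep_per_run=10,
               save_states_to_disk=True)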
def test_weighted_q_learning():
    pi, mdp, _ = initialize()
    agent = WeightedQLearning(mdp.info, pi, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[7.1592415, 4.07094744, 7.10518702, 8.5467274],
                       [8.08689916, 9.99023438, 5.77871216, 7.51059129],
                       [6.52294537, 0.86087671, 3.70431496, 9.6875],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_q_learning():
    pi, mdp, _ = initialize()
    agent = QLearning(mdp.info, pi, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[7.82042542, 8.40151978, 7.64961548, 8.82421875],
                       [8.77587891, 9.921875, 7.29316406, 8.68359375],
                       [7.7203125, 7.69921875, 4.5, 9.84375],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_r_learning():
    pi, mdp, _ = initialize()
    agent = RLearning(mdp.info, pi, Parameter(.1), Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[-6.19137991, -3.9368055, -5.11544257, -3.43673781],
                       [-2.52319391, 1.92201829, -2.77602918, -2.45972955],
                       [-5.38824415, -2.43019918, -1.09965936, 2.04202511],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_expected_sarsa():
    pi, mdp, _ = initialize()
    agent = ExpectedSARSA(mdp.info, pi, Parameter(.1))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.10221208, 0.48411449, 0.07688765, 0.64002317],
                       [0.58525881, 5.217031, 0.06047094, 0.48214145],
                       [0.08478224, 0.28873536, 0.06543094, 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_weighted_q_learning():
    pi, mdp, _ = initialize()
    agent = WeightedQLearning(mdp.info, pi, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[8.00815525, 4.09343205, 7.94406811, 8.96270031],
                       [8.31597686, 9.99023438, 6.42921521, 7.70471909],
                       [7.26069091, 0.87610663, 3.70440836, 9.6875],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_sarsa():
    pi, mdp, _ = initialize()
    agent = SARSA(mdp.info, pi, Parameter(.1))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array(
        [[4.31368701e-2, 3.68037689e-1, 4.14040445e-2, 1.64007642e-1],
         [6.45491436e-1, 4.68559000, 8.07603735e-2, 1.67297938e-1],
         [4.21445838e-2, 3.71538042e-3, 0., 3.439],
         [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_sarsa_lambda_discrete():
    pi, mdp, _ = initialize()
    agent = SARSALambda(mdp.info, pi, Parameter(.1), .9)

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[1.88093529, 2.42467354, 1.07390687, 2.39288988],
                       [2.46058746, 4.68559, 1.5661933, 2.56586018],
                       [1.24808966, 0.91948465, 0.47734152, 3.439],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)