Example #1
def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1., exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get()

    return Qs
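
A minimal driver for the experiment function above, shown only as a sketch: it assumes QLearning and DoubleQLearning can be imported from mushroom_rl.algorithms.value (the exact module path may vary between library versions) and uses an illustrative learning-rate decay exponent of .51.

import numpy as np
from mushroom_rl.algorithms.value import QLearning, DoubleQLearning

# Collect the Q-table history of two TD algorithms on the chain MDP;
# exp=.51 is an illustrative decay exponent, not a value from the original example.
for algorithm_class in (QLearning, DoubleQLearning):
    Qs = experiment(algorithm_class, exp=.51)
    print(algorithm_class.__name__, np.array(Qs).shape)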
Example #2
def test_sarsa_lambda_continuous_nn_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att in vars(agent_save):
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
Example #3
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size, ),
        output_shape=(mdp_continuous.info.action_space.n, ),
        n_actions=mdp_continuous.info.action_space.n)
    agent = SARSALambdaContinuous(mdp_continuous.info,
                                  pi,
                                  LinearApproximator,
                                  Parameter(.1),
                                  .9,
                                  features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([
        -16.62627886, 0., -13.03033079, 0., -15.93237930, 0., -9.72299176, 0.,
        -13.78884631, 0., -9.92157645, 0.
    ])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example #4
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size, ),
        output_shape=(mdp_continuous.info.action_space.n, ),
        n_actions=mdp_continuous.info.action_space.n)
    agent = TrueOnlineSARSALambda(mdp_continuous.info,
                                  pi,
                                  Parameter(.1),
                                  .9,
                                  features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([
        -17.30427303, 0., -13.54157504, 0., -16.82373134, 0., -10.29613337, 0.,
        -14.79470382, 0., -10.50654665, 0.
    ])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example #5
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(mdp.info,
                pi,
                approximator,
                approximator_params=approximator_params,
                **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return agent, np.mean(compute_J(dataset, mdp.info.gamma))
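
One way the learn helper above might be called, assuming a batch value-based algorithm such as FQI from mushroom_rl.algorithms.value whose constructor accepts an n_iterations keyword; both the class and the keyword are assumptions based on the library's batch TD interface, not part of the original snippet.

from mushroom_rl.algorithms.value import FQI

# Hypothetical invocation: train FQI on CarOnHill and report the mean discounted return.
agent, J = learn(FQI, dict(n_iterations=5))
print('FQI mean discounted return:', J)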
Example #6
def learn(alg, alg_params):
    mdp = LQR.generate(dimensions=1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy
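
A usage sketch for the policy-gradient variant of learn above. REINFORCE, AdaptiveOptimizer and the optimizer keyword mirror the REINFORCE examples further down this page, but the import paths are indicative and may differ between mushroom_rl versions.

from mushroom_rl.algorithms.policy_search import REINFORCE
from mushroom_rl.utils.optimizers import AdaptiveOptimizer

# Hypothetical invocation: run REINFORCE on the 1-D LQR and inspect the learned
# weights of the state-dependent-std Gaussian policy.
policy = learn(REINFORCE, dict(optimizer=AdaptiveOptimizer(eps=.01)))
print('policy weights:', policy.get_weights())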
Example #7
def learn_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info,
                 pi,
                 approximator_params=approximator_params,
                 fit_params=fit_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    return agent
Example #8
def test_collect_Q():
    np.random.seed(88)
    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(mdp.info, pi, alpha)

    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])

    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574, 0.02246188, 1.6210059, 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)
Example #9
def learn(alg, alg_params):
    mdp = InvertedPendulum(horizon=50)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': 3e-4
                             }
                         },
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return agent
Example #10
def test_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info,
                 pi,
                 approximator_params=approximator_params,
                 fit_params=fit_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    w = agent.approximator.get_weights()
    w_test = np.array([-1.00749128, -1.13444655, -0.96620322])

    assert np.allclose(w, w_test)
Example #11
def test_true_online_sarsa_lambda_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent_save = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att in vars(agent_save):
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
Example #12
def test_sarsa_lambda_continuous_nn():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0]
    )

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-0.18968964, 0.4296857, 0.52967095, 0.5674884,
                       -0.12784956, -0.10572472, -0.14546978, -0.67001086,
                       -0.93925357])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example #13
def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialParameter(value=1,
                                   exp=.5,
                                   size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1, exp=exp, size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.Q, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get()

    return reward, max_Qs
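
Because the experiment above reseeds NumPy without a fixed seed, a natural use is to average several independent runs. The sketch below assumes joblib for parallelism and QLearning/DoubleQLearning from mushroom_rl; all of these names are illustrative additions rather than part of the original example.

import numpy as np
from joblib import Parallel, delayed
from mushroom_rl.algorithms.value import QLearning, DoubleQLearning

n_runs = 4  # illustrative number of independent runs
for algorithm_class in (QLearning, DoubleQLearning):
    out = Parallel(n_jobs=-1)(
        delayed(experiment)(algorithm_class, .51) for _ in range(n_runs))
    rewards, max_Qs = zip(*out)
    print(algorithm_class.__name__, np.mean([np.sum(r) for r in rewards]))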
Example #14
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example #15
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example #16
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # save normalization / plot states to disk path
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # load states from disk path
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")
Example #17
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()
    print('============ start experiment ============')
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = GraspEnv()
    print('============ mdp ============')

    # Policy
    n_weights = 6
    mu = np.array([-0.5, 0.0, 0.91, m.pi, 0, 0])
    sigma = np.asarray([0.05, 0.05, 0.05, 0.1, 0.1, 0.1])  # alternative: [0.15, 0.15, 0.15, 0.4, 0.4, 0.4]
    policy = Own_policy()
    dist = GaussianDiagonalDistribution(mu, sigma)
    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        print('================ core learn ================')
        core.learn(n_episodes=n_episodes, n_episodes_per_fit=n_ep_per_fit)

        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        print('J:', J)
        print('============================')
        dataset_callback.clean()

        # mu_0..mu_5, avg_sigma and list_J are lists defined outside this excerpt;
        # they track the search distribution parameters across epochs.
        p = dist.get_parameters()
        print('p:', p)
        mu_0.append(p[:n_weights][0])
        mu_1.append(p[:n_weights][1])
        mu_2.append(p[:n_weights][2])
        mu_3.append(p[:n_weights][3])
        mu_4.append(p[:n_weights][4])
        mu_5.append(p[:n_weights][5])

        current_avg_sigma = (p[n_weights:][0] + p[n_weights:][1] +
                             p[n_weights:][2] + p[n_weights:][3] +
                             p[n_weights:][4] + p[n_weights:][5]) / 6
        avg_sigma.append(current_avg_sigma)

        # record learning curve of cumulative rewards
        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])
        list_J.append(np.mean(J))
Example #18
def learn(alg, alg_params):
    # MDP
    mdp = CartPole()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(
        network=Network if alg is not CategoricalDQN else FeatureNetwork,
        optimizer={
            'class': optim.Adam,
            'params': {
                'lr': .001
            }
        },
        loss=F.smooth_l1_loss,
        input_shape=input_shape,
        output_shape=mdp.info.action_space.size,
        n_actions=mdp.info.action_space.n,
        n_features=2,
        use_cuda=False)

    # Agent
    if alg not in [DuelingDQN, CategoricalDQN]:
        agent = alg(mdp.info,
                    pi,
                    TorchApproximator,
                    approximator_params=approximator_params,
                    **alg_params)
    elif alg is CategoricalDQN:
        agent = alg(mdp.info,
                    pi,
                    approximator_params=approximator_params,
                    n_atoms=2,
                    v_min=-1,
                    v_max=1,
                    **alg_params)
    else:
        agent = alg(mdp.info,
                    pi,
                    approximator_params=approximator_params,
                    **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=500, n_steps_per_fit=5)

    return agent
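
A hedged example of how the plain-DQN branch of learn might be invoked; DQN is assumed to come from mushroom_rl.algorithms.value, the keyword arguments follow its constructor (batch size, replay sizes, target update frequency), and the concrete values are only sized to fit the short 500-step run inside learn.

from mushroom_rl.algorithms.value import DQN

# Hypothetical invocation: small replay buffer and frequent target updates.
agent = learn(DQN, dict(batch_size=50,
                        initial_replay_size=50,
                        max_replay_size=500,
                        target_update_frequency=50))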
Example #19
 def show_agent(self, episodes=5, mdp_render=False):
     """
     Method to run and visualize the best builders in the environment.
     """
     matplotlib.use(default_backend)
     mdp = self.logger.load_environment_builder().build()
     if mdp_render:
         mdp.render()
     agent = self.logger.load_best_agent()
     core = Core(agent, mdp)
     core.evaluate(n_episodes=episodes, render=True)
Example #20
def learn(alg):
    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=False)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': .001}}

    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0], )
    critic_params = dict(network=CriticNetwork,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': .001
                             }
                         },
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1, ),
                         use_cuda=False)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params, actor_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, tau)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent.policy
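
A usage sketch for the off-policy actor-critic helper above; DDPG from mushroom_rl.algorithms.actor_critic matches the positional arguments (policy class and params, actor params, actor optimizer, critic params, batch size, replay sizes, tau) that learn passes through, though the import path is an assumption about the library layout.

from mushroom_rl.algorithms.actor_critic import DDPG

# Hypothetical invocation: train DDPG on Pendulum-v0 and return its exploration policy.
policy = learn(DDPG)
print(type(policy).__name__)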
Example #21
def test_dataset_utils():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))
    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    agent = SARSA(mdp.info, pi, alpha)
    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([
        1.16106307e-03, 2.78128389e-01, 1.66771817e+00, 3.09031544e-01,
        1.19725152e-01, 9.84770902e-01, 1.06111661e-02, 2.05891132e+00,
        2.28767925e+00, 4.23911583e-01
    ])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0011610630703530948
    assert max_J == 0.2781283894436937
    assert mean_J == 0.1396447262570234
    assert n_episodes == 2
Example #22
def test_maxmin_q_learning():
    pi, mdp, _ = initialize()
    agent = MaxminQLearning(mdp.info, pi, Parameter(.5), n_tables=4)

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0., 0., 0., 0.], [0., 7.5, 0., 0.], [0., 0., 0., 5.],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q[0].table, test_q)
Example #23
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        print('Epoch: ', n, '  J: ', np.mean(compute_J(dataset,
                                                       mdp.info.gamma)))

    if save_states_to_disk:
        # save normalization / plot states to disk path
        os.makedirs("./temp/", exist_ok=True)
        prepro.save_state("./temp/normalization_state")
        plotter.save_state("./temp/plotting_state")

        # load states from disk path
        prepro.load_state("./temp/normalization_state")
        plotter.load_state("./temp/plotting_state")
Example #24
def test_weighted_q_learning():
    pi, mdp, _ = initialize()
    agent = WeightedQLearning(mdp.info, pi, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[7.1592415, 4.07094744, 7.10518702, 8.5467274],
                       [8.08689916, 9.99023438, 5.77871216, 7.51059129],
                       [6.52294537, 0.86087671, 3.70431496, 9.6875],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #25
def test_q_learning():
    pi, mdp, _ = initialize()
    agent = QLearning(mdp.info, pi, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[7.82042542, 8.40151978, 7.64961548, 8.82421875],
                       [8.77587891, 9.921875, 7.29316406, 8.68359375],
                       [7.7203125, 7.69921875, 4.5, 9.84375],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #26
def test_r_learning():
    pi, mdp, _ = initialize()
    agent = RLearning(mdp.info, pi, Parameter(.1), Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[-6.19137991, -3.9368055, -5.11544257, -3.43673781],
                       [-2.52319391, 1.92201829, -2.77602918, -2.45972955],
                       [-5.38824415, -2.43019918, -1.09965936, 2.04202511],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #27
def test_expected_sarsa():
    pi, mdp, _ = initialize()
    agent = ExpectedSARSA(mdp.info, pi, Parameter(.1))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.10221208, 0.48411449, 0.07688765, 0.64002317],
                       [0.58525881, 5.217031, 0.06047094, 0.48214145],
                       [0.08478224, 0.28873536, 0.06543094, 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #28
def test_weighted_q_learning():
    pi, mdp, _ = initialize()
    agent = WeightedQLearning(mdp.info, pi, Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[8.00815525, 4.09343205, 7.94406811, 8.96270031],
                       [8.31597686, 9.99023438, 6.42921521, 7.70471909],
                       [7.26069091, 0.87610663, 3.70440836, 9.6875],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #29
def test_sarsa():
    pi, mdp, _ = initialize()
    agent = SARSA(mdp.info, pi, Parameter(.1))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[4.31368701e-2, 3.68037689e-1, 4.14040445e-2, 1.64007642e-1],
                       [6.45491436e-1, 4.68559000, 8.07603735e-2, 1.67297938e-1],
                       [4.21445838e-2, 3.71538042e-3, 0., 3.439],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
Example #30
def test_sarsa_lambda_discrete():
    pi, mdp, _ = initialize()
    agent = SARSALambda(mdp.info, pi, Parameter(.1), .9)

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[1.88093529, 2.42467354, 1.07390687, 2.39288988],
                       [2.46058746, 4.68559, 1.5661933, 2.56586018],
                       [1.24808966, 0.91948465, 0.47734152, 3.439],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)