Example no. 1
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = SARSALambdaContinuous(LinearApproximator, pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
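
All of the examples on this page build tile-coding features the same way: Tiles.generate creates a list of tilings and Features turns them into a feature extractor. The short sketch below shows only this shared pattern; it is an illustration assuming the mushroom package imported in the later examples, not part of the original test.

import numpy as np

from mushroom.features import Features
from mushroom.features.tiles import Tiles

# One tiling with a 2x2 grid over the box [0, 1] x [-0.5, 0.5].
tilings = Tiles.generate(1, [2, 2], np.array([0., -.5]), np.array([1., .5]))
features = Features(tilings=tilings)

# Each tiling contributes prod(n_tiles) binary features, so the feature
# vector has n_tilings * prod(n_tiles) = 4 entries, with one active tile
# per tiling.
print(features.size)                    # 4
print(features(np.array([0.3, 0.1])))   # e.g. [0. 1. 0. 0.]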
Example no. 2
def build_low_level_ghavamzadeh(alg, params, mdp):
    # FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings,
                              n_tiles=n_tiles,
                              low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([150, 150]), shape=(2, )),
                              action_space=mdp.info.action_space,
                              gamma=0.99,
                              horizon=10000)

    input_shape = (featuresL.size, )
    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    std = np.array([3e-2])
    pi = DiagonalGaussianPolicy(mu=approximator, std=std)

    agent = alg(pi, mdp_info_agentL, features=featuresL, **params)

    return agent
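
For reference (a hedged side note, not part of the original snippet): assuming Features sums the sizes of its tilings, as in the sketch after Example no. 1, featuresL above has n_tilings * prod(n_tiles) entries, which is also the input dimension of the linear mean approximator.

import numpy as np

# Illustrative only: 3 tilings of a 5 x 5 x 10 grid give 750 tile features.
n_tilings, n_tiles = 3, [5, 5, 10]
assert n_tilings * int(np.prod(n_tiles)) == 750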
Example no. 3
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n
    )
    agent = TrueOnlineSARSALambda(pi, mdp_continuous.info,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
Example no. 4
def test_sarsa_lambda_continuous():
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size, ),
        output_shape=(mdp_continuous.info.action_space.n, ),
        n_actions=mdp_continuous.info.action_space.n)
    alg = SARSALambdaContinuous(LinearApproximator,
                                pi,
                                mdp_continuous.info,
                                Parameter(.1),
                                .9,
                                features=features,
                                approximator_params=approximator_params)

    s_1 = np.linspace(mdp_continuous.info.observation_space.low[0],
                      mdp_continuous.info.observation_space.high[0], 10)
    s_2 = np.linspace(mdp_continuous.info.observation_space.low[1],
                      mdp_continuous.info.observation_space.high[1], 10)
    for i in s_1:
        for j in s_2:
            alg._update(np.array([i, j]), np.array([1]), 100, np.array([0, 0]),
                        0)

    test_w = np.array([[0, 0, 0, 0], [320.43, 399.8616, 340.397, 417.218],
                       [0, 0, 0, 0]])

    assert np.allclose(alg.Q.get_weights(), test_w.ravel())
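
A hedged note on the expected weights above (not part of the original test): the approximator has 4 tile features (one 2x2 tiling) and one weight vector per action, so the flattened weights have 4 x 3 = 12 entries; only the row of the single action used in the updates (action 1) ends up non-zero, which is consistent with the shape of test_w.

import numpy as np

# Illustrative only: relate the shape of test_w to the feature/action counts.
n_features = 1 * 2 * 2     # n_tilings * prod(n_tiles)
n_actions = 3              # inferred from test_w.shape == (3, 4)
assert n_features * n_actions == 12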
Example no. 5
def test_true_online_sarsa_lambda():
    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size, ),
        output_shape=(mdp_continuous.info.action_space.n, ),
        n_actions=mdp_continuous.info.action_space.n)
    alg = TrueOnlineSARSALambda(pi,
                                mdp_continuous.info,
                                Parameter(.1),
                                .9,
                                features=features,
                                approximator_params=approximator_params)

    s_1 = np.linspace(mdp_continuous.info.observation_space.low[0],
                      mdp_continuous.info.observation_space.high[0], 10)
    s_2 = np.linspace(mdp_continuous.info.observation_space.low[1],
                      mdp_continuous.info.observation_space.high[1], 10)
    for i in s_1:
        for j in s_2:
            alg._update(np.array([i, j]), np.array([1]), 100, np.array([0, 0]),
                        0)

    test_w = np.array([[0, 0, 0, 0], [927.0283, 798.57594, 876.8018, 705.227],
                       [0, 0, 0, 0]])

    assert np.allclose(alg.Q.get_weights(), test_w.ravel())
Example no. 6
def experiment(alg, params, experiment_params, subdir, i):

    np.random.seed()

    # MDP
    mdp = ShipSteering(small=True, n_steps_action=3)

    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    #sigma = np.array([[1e-4]])
    std = np.array([3e-2])
    policy = DiagonalGaussianPolicy(mu=approximator, std=std)
    #policy = GaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    agent = alg(policy, mdp.info, features=phi, **params)

    # Train
    parameter_dataset = CollectPolicyParameter(policy)
    core = Core(agent, mdp, callbacks=[parameter_dataset])


    dataset_eval = list()
    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))

    for n in range(n_runs):
        print('ITERATION    :', n)
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

    mk_dir_recursive('./' + subdir + str(i))
    np.save(subdir+str(i)+'/dataset_eval_file', dataset_eval)
    np.save(subdir+str(i)+'/parameter_dataset_file', parameter_dataset)
Example no. 7
    def __init__(self, low, high, n_tiles, name='discretization_block'):
        self.low = low * np.ones(shape=np.shape(n_tiles))
        self.high = high * np.ones(shape=np.shape(n_tiles))
        self.n_tiles = n_tiles
        self.tilings = Tiles.generate(n_tilings=1,
                                      n_tiles=self.n_tiles,
                                      low=self.low,
                                      high=self.high)
        super(DiscretizationBlock, self).__init__(name=name)
Example no. 8
def experiment(alg, params, subdir, exp_no):
    np.random.seed()

    # MDP
    mdp = ShipSteering(small=True, n_steps_action=3)

    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings,
                             n_tiles=n_tiles,
                             low=low,
                             high=high)
    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    approximator_params = dict(input_dim=input_shape[0])
    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    dataset_eval = list()
    core = Core(agent, mdp)
    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    #print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

    mk_dir_recursive('./' + subdir + str(exp_no))
    np.save(subdir + str(exp_no) + '/dataset_eval_file', dataset_eval)
Example no. 9
    def reset(self, inputs):
        tilings = Tiles.generate(n_tilings=1,
                                 n_tiles=self.n_tiles,
                                 low=self.low,
                                 high=self.high)
        state = np.concatenate(inputs, axis=0)

        index = list()
        for tile in tilings:
            index.append(tile(state))
        self.last_output = index
Example no. 10
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
Example no. 11
def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings,
                             n_tiles=n_tiles,
                             low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size, )

    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
Example no. 12
def test_sac():
    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)
    agent = SAC(policy, mdp.info, alpha_theta, alpha_v, lambda_par=.5,
                value_function_features=psi, policy_features=phi)
    agent.fit(dataset)

    w_1 = .09370429
    w_2 = .28141735

    w = agent._V.get_weights()

    assert np.allclose(w_1, w[65])
    assert np.allclose(w_2, w[78])
Example no. 13
def experiment(n_epochs, n_steps, n_eval_episodes):
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Agent
    n_tilings = 10
    alpha_theta = ExponentialDecayParameter(1, decay_exp=1.0)
    alpha_omega = ExponentialDecayParameter(1.5 / n_tilings, decay_exp=2 / 3)
    alpha_v = ExponentialDecayParameter(1 / n_tilings, decay_exp=2 / 3)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-3 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy,
                     mu,
                     mdp.info,
                     alpha_theta,
                     alpha_omega,
                     alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=n_eval_episodes)
    J = compute_J(dataset_eval, gamma=1.0)
    print('Total Reward per episode at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset_eval = core.evaluate(n_episodes=n_eval_episodes, render=False)
        J = compute_J(dataset_eval, gamma=1.0)
        print('Total Reward per episode at iteration ' + str(i) + ': ' +
              str(np.mean(J)))
Example no. 14
def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
Example no. 15
def build_high_level_agent(alg, params, mdp, mu, std):
    tilings = Tiles.generate(n_tilings=1,
                             n_tiles=[10, 10],
                             low=mdp.info.observation_space.low[:2],
                             high=mdp.info.observation_space.high[:2])
    features = Features(tilings=tilings)

    input_shape = (features.size, )

    mu_approximator = Regressor(LinearApproximator,
                                input_shape=input_shape,
                                output_shape=(1, ))
    std_approximator = Regressor(LinearApproximator,
                                 input_shape=input_shape,
                                 output_shape=(1, ))

    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    w_std = std * np.ones(std_approximator.weights_size)
    std_approximator.set_weights(w_std)

    pi = StateLogStdGaussianPolicy(mu=mu_approximator,
                                   log_std=std_approximator)

    obs_low = np.array(
        [mdp.info.observation_space.low[0], mdp.info.observation_space.low[1]])
    obs_high = np.array([
        mdp.info.observation_space.high[0], mdp.info.observation_space.high[1]
    ])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(obs_low,
                                                           obs_high,
                                                           shape=(2, )),
                              action_space=spaces.Box(
                                  mdp.info.observation_space.low[2],
                                  mdp.info.observation_space.high[2],
                                  shape=(1, )),
                              gamma=1,
                              horizon=10)
    agent = alg(policy=pi,
                mdp_info=mdp_info_agent1,
                features=features,
                **params)

    return agent
Example no. 16
def test_sac_avg():
    alpha_r = Parameter(.0001)
    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)
    agent = SAC_AVG(policy, mdp.info, alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5, value_function_features=psi,
                    policy_features=phi)
    agent.fit(dataset)

    w_1 = .09645764
    w_2 = .28583057

    w = agent._V.get_weights()

    assert np.allclose(w_1, w[65])
    assert np.allclose(w_2, w[78])
Example no. 17
def experiment(alpha):
    gym.logger.setLevel(0)
    np.random.seed(386)

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=10000, gamma=1.)
    mdp.seed(201)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.array([[0., 0.], [.1, .1]])
    dataset = core.evaluate(initial_states=initial_states, quiet=True)

    return np.mean(compute_J(dataset, 1.))
Example no. 18
def tiles():
    tilings = Tiles.generate(3, [3, 3], np.array([0., -.5]), np.array([1.,
                                                                       .5]))
    features = Features(tilings=tilings)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.all(features(x_i) == y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.all(features(x_1, x_2) == y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.all(features(x_i[0], x_i[1]) == y[i])
Example no. 19
def build_approximator(mdp):
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 8]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high, uniform=True)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    return phi, approximator
Example no. 20
def test_tiles():
    tilings = Tiles.generate(3, [3, 3],
                             np.array([0., -.5]),
                             np.array([1., .5]))
    features = Features(tilings=tilings)

    x = np.random.rand(10, 2) + [0., -.5]

    y = features(x)

    for i, x_i in enumerate(x):
        assert np.all(features(x_i) == y[i])

    x_1 = x[:, 0].reshape(-1, 1)
    x_2 = x[:, 1].reshape(-1, 1)

    assert np.all(features(x_1, x_2) == y)

    for i, x_i in enumerate(zip(x_1, x_2)):
        assert np.all(features(x_i[0], x_i[1]) == y[i])
Example no. 21
def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)
Example no. 22
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10], mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
    fit_params = dict()
    agent_params = {
        'approximator_params': approximator_params,
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks=callbacks)

    # Train
    core.learn(n_episodes=20, n_steps_per_fit=1, render=0)

    dataset = collect_dataset.get()
    return np.mean(compute_J(dataset, 1.))
Example no. 23
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(pi,
                                  mdp.info,
                                  approximator_params=approximator_params,
                                  features=features,
                                  **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))
Example no. 24
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(pi, mdp.info,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=40, n_steps_per_fit=1, render=False)
    dataset = core.evaluate(n_episodes=1, render=True)

    return np.mean(compute_J(dataset, 1.))
Example no. 25
def learn(alg):
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 2
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [1, 1],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator,
                    input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    if alg is StochasticAC:
        agent = alg(policy,
                    mdp.info,
                    alpha_theta,
                    alpha_v,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)
    elif alg is StochasticAC_AVG:
        agent = alg(policy,
                    mdp.info,
                    alpha_theta,
                    alpha_v,
                    alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return policy
Example no. 26
def experiment_ghavamzade(alg_high, alg_low, params, subdir, i):

    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]
    n_tilings = 1

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi,
                                               high=lim_hi,
                                               n_tiles=n_tiles_high)

    # PolicyH
    epsilon = Parameter(value=0.1)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = params.get('learning_rate_high')

    mdp_info_agentH = MDPInfo(observation_space=spaces.Discrete(
        n_tiles_high[0] * n_tiles_high[1]),
                              action_space=spaces.Discrete(8),
                              gamma=1,
                              horizon=10000)

    agentH = alg_high(policy=piH,
                      mdp_info=mdp_info_agentH,
                      learning_rate=learning_rate,
                      lambda_coeff=0.9)

    epsilon_update = EpsilonUpdate(piH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H',
                                  agent=agentH,
                                  n_steps_per_fit=1)

    #FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings,
                              n_tiles=n_tiles,
                              low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([150, 150]), shape=(2, )),
                              action_space=mdp.info.action_space,
                              gamma=0.99,
                              horizon=10000)

    # Approximators
    input_shape = (featuresL.size, )

    approximator_params = dict(input_dim=input_shape[0])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)

    # Policy1
    std1 = np.array([3e-2])
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=std1)

    # Policy2
    std2 = np.array([3e-2])
    pi2 = DiagonalGaussianPolicy(mu=approximator2, std=std2)

    # Agent1
    learning_rate1 = params.get('learning_rate_low')
    agent1 = alg_low(pi1, mdp_info_agentL, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = params.get('learning_rate_low')
    agent2 = alg_low(pi2, mdp_info_agentL, learning_rate2, featuresL)

    #Termination Conds
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    low_ep_per_fit = params.get('low_ep_per_fit')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1',
        agent=agent1,
        n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2',
        agent=agent2,
        n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition2)

    # Function Block 1: picks the state for the high-level controller
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the environment state to the low-level
    # controller's state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds the current state as reference
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds the high-level reward
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds the low-level reward
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: external reward of the high-level controller
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: external reward of the low-level controller
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    #Reward Accumulator H:
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f8 selector')

    #Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    #Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
        function_block1, function_block2, function_block3, function_block4,
        function_block5, function_block6, function_block7, function_block8,
        reward_acc_H, discretization_block
    ]

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))
    for n in range(n_runs):
        print('ITERATION', n)

        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

        dataset_plus = control_block_plus.dataset.get()
        J_plus = compute_J(dataset_plus, mdp.info.gamma)
        dataset_cross = control_block_cross.dataset.get()
        J_cross = compute_J(dataset_cross, mdp.info.gamma)

        low_level_dataset_eval1.append(dataset_plus)
        low_level_dataset_eval2.append(dataset_cross)

        print('J ll PLUS at iteration  ' + str(n) + ': ' +
              str(np.mean(J_plus)))
        print('J ll CROSS at iteration ' + str(n) + ': ' +
              str(np.mean(J_cross)))
        if n == 4:
            control_blockH.callbacks = [epsilon_update]

    # Tile data
    hi_lev_params = agentH.Q.table
    max_q_val = np.zeros(n_tiles_high[0]**2)
    act_max_q_val = np.zeros(n_tiles_high[0]**2)
    for n in range(n_tiles_high[0]**2):
        max_q_val[n] = np.amax(hi_lev_params[n])
        act_max_q_val[n] = np.argmax(hi_lev_params[n])

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset1_file',
            low_level_dataset_eval1)
    np.save(subdir + str(i) + '/low_level_dataset2_file',
            low_level_dataset_eval2)
    np.save(subdir + str(i) + '/max_q_val_tiled_file', max_q_val)
    np.save(subdir + str(i) + '/act_max_q_val_tiled_file', act_max_q_val)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return
Example no. 27
from mushroom.features import Features
from mushroom.features.tiles import Tiles
from mushroom.policy import EpsGreedy
from mushroom.utils.callbacks import CollectDataset
from mushroom.utils.parameters import Parameter

# MDP
mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

# Policy
epsilon = Parameter(value=0.)
pi = EpsGreedy(epsilon=epsilon)

# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10], mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size, ),
                           output_shape=(mdp.info.action_space.n, ),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)
algorithm_params = {'learning_rate': learning_rate, 'lambda': .9}
fit_params = dict()
agent_params = {
    'approximator_params': approximator_params,
    'algorithm_params': algorithm_params,
    'fit_params': fit_params
}
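
The snippet stops before constructing the agent. A possible continuation is sketched below; it is an assumption based on Examples no. 17 and no. 22 on this page, which pass the same dict-style parameters to the older TrueOnlineSARSALambda constructor, and the import paths are likewise assumed.

from mushroom.algorithms.value import TrueOnlineSARSALambda
from mushroom.core import Core

# Build the agent with the dict-style parameters and run a short training
# loop, as in Example no. 17.
agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)
core = Core(agent, mdp)
core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)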
Example no. 28
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings-1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = SAC_AVG(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
Example no. 29
def experiment():

    small = True

    print('ENV IS SMALL? ', small)
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=small, hard=True, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    #FeaturesH
    lim = 150 if small else 1000

    tilingsH = Tiles.generate(n_tilings=1,
                              n_tiles=[5, 5],
                              low=[0, 0],
                              high=[lim, lim])
    featuresH = Features(tilings=tilingsH)

    # PolicyH
    epsilon = LinearDecayParameter(value=0.1, min_value=0.0, n=10000)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = Parameter(value=1)

    mdp_info_agentH = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([lim, lim]), shape=(2, )),
                              action_space=spaces.Discrete(8),
                              gamma=1,
                              horizon=10000)
    approximator_paramsH = dict(input_shape=(featuresH.size, ),
                                output_shape=mdp_info_agentH.action_space.size,
                                n_actions=mdp_info_agentH.action_space.n)

    agentH = TrueOnlineSARSALambda(policy=piH,
                                   mdp_info=mdp_info_agentH,
                                   learning_rate=learning_rate,
                                   lambda_coeff=0.9,
                                   approximator_params=approximator_paramsH,
                                   features=featuresH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H',
                                  agent=agentH,
                                  n_steps_per_fit=1)

    #FeaturesL
    featuresL = Features(basis_list=[PolynomialBasis()])

    # Policy1
    input_shape = (featuresL.size, )

    approximator_params = dict(input_dim=input_shape[0])
    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             **approximator_params)
    sigma = np.array([[1.3e-2]])
    pi1 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Policy2
    pi2 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Agent1
    learning_rate1 = AdaptiveParameter(value=1e-5)
    agent1 = GPOMDP(pi1, mdp.info, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = AdaptiveParameter(value=1e-5)
    agent2 = GPOMDP(pi2, mdp.info, learning_rate2, featuresL)

    #Termination Conds
    termination_condition1 = TerminationCondition(active_dir=1, small=small)
    termination_condition2 = TerminationCondition(active_dir=5, small=small)

    # Control Block +
    control_block1 = ControlBlock(name='control block 1',
                                  agent=agent1,
                                  n_eps_per_fit=50,
                                  termination_condition=termination_condition1)

    # Control Block x
    control_block2 = ControlBlock(name='control block 2',
                                  agent=agent2,
                                  n_eps_per_fit=50,
                                  termination_condition=termination_condition2)

    # Function Block 1: picks the state for the high-level controller
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the environment state to the low-level
    # controller's state
    function_block2 = fBlock(phi=rototranslate(small=small), name='f2 rotot')

    # Function Block 3: holds the current state as reference
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds the high-level reward
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds the low-level reward
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: external reward of the high-level controller
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: external reward of the low-level controller
    function_block7 = fBlock(phi=G_low(small=small), name='f7 G_lo')

    #Reward Accumulator H:
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    #Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block1])
    mux_block.add_block_list([control_block2])

    #Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
        function_block1, function_block2, function_block3, function_block4,
        function_block5, function_block6, function_block7, reward_acc_H
    ]

    #state_ph.add_input(mux_block)
    #reward_ph.add_input(mux_block)
    #lastaction_ph.add_input(mux_block)
    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block1)
    reward_acc_H.add_alarm_connection(control_block2)
    control_blockH.add_input(function_block1)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block1)
    control_blockH.add_alarm_connection(control_block2)
    mux_block.add_input(control_blockH)
    mux_block.add_input(function_block2)
    control_block1.add_reward(function_block5)
    control_block2.add_reward(function_block5)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)
    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block1)
    function_block3.add_alarm_connection(control_block2)
    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)
    function_block5.add_input(reward_ph)
    function_block5.add_input(function_block7)
    function_block6.add_input(reward_ph)
    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_visual = list()
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()

    n_runs = 5
    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=1000, skip=True)
        dataset_eval = core.evaluate(n_episodes=10)
        last_ep_dataset = pick_last_ep(dataset_eval)
        dataset_eval_visual += last_ep_dataset
        low_level_dataset_eval1 += control_block1.dataset.get()
        low_level_dataset_eval2 += control_block2.dataset.get()

    # Visualize
    hi_lev_params = agentH.Q.get_weights()
    hi_lev_params = np.reshape(hi_lev_params, (8, 25))
    max_q_val = np.zeros(shape=(25, ))
    act_max_q_val = np.zeros(shape=(25, ))
    for i in range(25):
        max_q_val[i] = np.amax(hi_lev_params[:, i])
        act_max_q_val[i] = np.argmax(hi_lev_params[:, i])
    max_q_val_tiled = np.reshape(max_q_val, (5, 5))
    act_max_q_val_tiled = np.reshape(act_max_q_val, (5, 5))
    #low_level_dataset1 = dataset_callback1.get()
    #low_level_dataset2 = dataset_callback2.get()

    subdir = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '/'
    mk_dir_recursive('./' + subdir)

    np.save(subdir + '/low_level_dataset1_file', low_level_dataset_eval1)
    np.save(subdir + '/low_level_dataset2_file', low_level_dataset_eval2)
    np.save(subdir + '/max_q_val_tiled_file', max_q_val_tiled)
    np.save(subdir + '/act_max_q_val_tiled_file', act_max_q_val_tiled)
    np.save(subdir + '/dataset_eval_file', dataset_eval_visual)

    return
Example no. 30
from mushroom.environments import ShipSteering
from mushroom.features import Features
from mushroom.features.tiles import Tiles
from mushroom.policy import DeterministicPolicy
from mushroom.utils.parameters import AdaptiveParameter

mdp = ShipSteering()

high = [150, 150, np.pi]
low = [0, 0, -np.pi]
n_tiles = [5, 5, 6]
low = np.array(low, dtype=float)
high = np.array(high, dtype=float)
n_tilings = 1

tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                         high=high)

phi = Features(tilings=tilings)
input_shape = (phi.size,)

approximator = Regressor(LinearApproximator, input_shape=input_shape,
                         output_shape=mdp.info.action_space.shape)

policy = DeterministicPolicy(approximator)

mu = np.zeros(policy.weights_size)
sigma = 4e-1 * np.ones(policy.weights_size)
distribution_test = GaussianDiagonalDistribution(mu, sigma)
agent_test = RWR(distribution_test, policy, mdp.info, beta=1.)
core = Core(agent_test, mdp)
Example no. 31
from mushroom.policy import EpsGreedy
from mushroom.utils.callbacks import CollectDataset
from mushroom.utils.parameters import Parameter


# MDP
mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

# Policy
epsilon = Parameter(value=0.)
pi = EpsGreedy(epsilon=epsilon)

# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)

agent = SARSALambdaContinuous(LinearApproximator, pi, mdp.info,
                              approximator_params=approximator_params,
                              learning_rate=learning_rate,
                              lambda_coeff=.9, features=features)
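
Example no. 31 ends with the agent definition. A hedged continuation is sketched below, mirroring the training and evaluation loops of the earlier MountainCar examples; the import paths and step counts are assumptions, not part of the original snippet.

import numpy as np

from mushroom.core import Core
from mushroom.utils.dataset import compute_J

# Run on-line SARSA(lambda) updates for a while, then evaluate the greedy
# policy (epsilon is already 0 above).
core = Core(agent, mdp)
core.learn(n_episodes=40, n_steps_per_fit=1)

dataset = core.evaluate(n_episodes=1, render=False)
print('Mean J:', np.mean(compute_J(dataset, mdp.info.gamma)))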