Example #1
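A shared training helper: builds a Pendulum-v0 task with a Gaussian torch policy and an Adam-based critic, runs the given actor-critic algorithm for two episodes and returns the trained agent.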
def learn(alg, alg_params):
    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': 3e-4
                             }
                         },
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return agent
Example #2
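Unit test for A2C on Pendulum-v0: trains for ten episodes with RMSprop optimizers for actor and critic, then checks the resulting policy weights against stored reference values.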
def test_a2c():
    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    policy_params = dict(std_0=1., n_features=64, use_cuda=False)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.RMSprop,
                             'params': {
                                 'lr': 7e-4,
                                 'eps': 1e-5
                             }
                         },
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    algorithm_params = dict(critic_params=critic_params,
                            actor_optimizer={
                                'class': optim.RMSprop,
                                'params': {
                                    'lr': 7e-4,
                                    'eps': 3e-3
                                }
                            },
                            max_grad_norm=0.5,
                            ent_coeff=0.01)

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    agent = A2C(mdp.info, policy, **algorithm_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    w = agent.policy.get_weights()
    w_test = np.array(
        [-1.6307759, 1.0356185, -0.34508315, 0.27108294, -0.01047843])

    assert np.allclose(w, w_test)
Example #3
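A variant of the helper in Example #1 that defines a minimal single-layer torch network inline and returns the trained policy instead of the agent.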
def learn(alg, alg_params):
    class Network(nn.Module):
        def __init__(self, input_shape, output_shape, **kwargs):
            super(Network, self).__init__()

            n_input = input_shape[-1]
            n_output = output_shape[0]

            self._h = nn.Linear(n_input, n_output)

            nn.init.xavier_uniform_(self._h.weight,
                                    gain=nn.init.calculate_gain('relu'))

        def forward(self, state, **kwargs):
            return F.relu(self._h(torch.squeeze(state, 1).float()))

    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return policy
Example #4
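A full experiment loop: evaluates the untrained policy, then trains the given algorithm for n_epochs epochs, logging discounted return J, undiscounted return R and policy entropy with tqdm, and finally renders five episodes.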
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps, n_steps_per_fit,
               n_episodes_test, alg_params, policy_params):
    print(alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': 3e-4
                             }
                         },
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    tqdm.write('END OF EPOCH 0')
    tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
    tqdm.write(
        '##################################################################################################'
    )

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        tqdm.write('END OF EPOCH ' + str(it + 1))
        tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
        tqdm.write(
            '##################################################################################################'
        )

    print('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
Example #5
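Trains an A2C agent on Pendulum-v0 for ten episodes with the same configuration as the test above and returns the agent.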
def learn_a2c():
    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    policy_params = dict(std_0=1., n_features=64, use_cuda=False)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.RMSprop,
                             'params': {
                                 'lr': 7e-4,
                                 'eps': 1e-5
                             }
                         },
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    algorithm_params = dict(critic_params=critic_params,
                            actor_optimizer={
                                'class': optim.RMSprop,
                                'params': {
                                    'lr': 7e-4,
                                    'eps': 3e-3
                                }
                            },
                            max_grad_norm=0.5,
                            ent_coeff=0.01)

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    agent = A2C(mdp.info, policy, **algorithm_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent
Example #6
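The same experiment structure as Example #4, but using the MushroomRL Logger for epoch reporting and step-based evaluation instead of tqdm output.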
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps, n_steps_per_fit,
               n_step_test, alg_params, policy_params):
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.RMSprop,
                             'params': {
                                 'lr': 7e-4,
                                 'eps': 1e-5
                             }
                         },
                         loss=F.mse_loss,
                         n_features=64,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    alg_params['critic_params'] = critic_params

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_steps=n_step_test, render=False)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    logger.epoch_info(0, J=J, R=R, entropy=E)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        logger.epoch_info(it + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
Example #7
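True Online SARSA(lambda) with tile-coding features and an epsilon-greedy policy on Acrobot-v1; returns the mean discounted return of a single evaluation episode.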
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='Acrobot-v1', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10, 10, 10, 10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate, 'lambda_coeff': .9}

    agent = TrueOnlineSARSALambda(mdp.info,
                                  pi,
                                  approximator_params=approximator_params,
                                  features=features,
                                  **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_steps_per_fit=1, render=True)
    dataset = core.evaluate(n_episodes=1, render=False)
    #print(dataset)
    print(episodes_length(dataset))

    return np.mean(compute_J(dataset, .96))
Example #8
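A standalone script: tabular Q-Learning with an epsilon-greedy policy on FrozenLake-v0, followed by reading the learned Q-table back from the approximator.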
import numpy as np

from mushroom_rl.algorithms.value import QLearning
from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.parameters import Parameter

mdp = Gym(name='FrozenLake-v0', horizon=np.inf, gamma=1.)

epsilon = Parameter(value=1.)
policy = EpsGreedy(epsilon=epsilon)

learning_rate = Parameter(value=.6)
agent = QLearning(mdp.info, policy, learning_rate)

core = Core(agent, mdp)
core.learn(n_steps=10000, n_steps_per_fit=1)

# Read the learned Q-values back from the table approximator, state by state
shape = agent.approximator.shape
q = np.zeros(shape)
for i in range(shape[0]):
    for j in range(shape[1]):
        state = np.array([i])
        action = np.array([j])
        q[i, j] = agent.approximator.predict(state, action)
print(q)
Example #9
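Tests the MinMaxPreprocessor with a small DQN on CartPole-v0: verifies that observations stay inside the clipping range and that the preprocessor's running statistics survive saving and loading.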
def test_normalizing_preprocessor(tmpdir):
    np.random.seed(88)

    class Network(nn.Module):
        def __init__(self, input_shape, output_shape, **kwargs):
            super().__init__()

            n_input = input_shape[-1]
            n_output = output_shape[0]

            self._h1 = nn.Linear(n_input, n_output)

            nn.init.xavier_uniform_(self._h1.weight,
                                    gain=nn.init.calculate_gain('relu'))

        def forward(self, state, action=None):
            q = F.relu(self._h1(torch.squeeze(state, 1).float()))
            if action is None:
                return q
            else:
                action = action.long()
                q_acted = torch.squeeze(q.gather(1, action))
                return q_acted

    mdp = Gym('CartPole-v0', horizon=500, gamma=.99)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape

    approximator_params = dict(network=Network,
                               optimizer={'class':  optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n,
                               n_features=2, use_cuda=False)

    alg_params = dict(batch_size=5, initial_replay_size=10,
                      max_replay_size=500, target_update_frequency=50)

    agent = DQN(mdp.info, pi, TorchApproximator,
                approximator_params=approximator_params, **alg_params)

    norm_box = MinMaxPreprocessor(mdp_info=mdp.info,
                                  clip_obs=5.0, alpha=0.001)

    core = Core(agent, mdp, preprocessors=[norm_box])

    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    # training correctly
    assert (core._state.min() >= -norm_box._clip_obs
            and core._state.max() <= norm_box._clip_obs)

    # loading and setting data correctly
    state_dict1 = norm_box.get_state()
    norm_box.save(tmpdir / 'norm_box.msh')

    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    norm_box = MinMaxPreprocessor.load(tmpdir / 'norm_box.msh')
    state_dict2 = norm_box.get_state()

    assert ((state_dict1["mean"] == state_dict2["mean"]).all()
            and (state_dict1["var"] == state_dict2["var"]).all()
            and state_dict1["count"] == state_dict2["count"])

    core = Core(agent, mdp, preprocessors=[norm_box])
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)
Example #10
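Setup for SARSA(lambda) with a linear Q-function approximator and tile coding on MountainCar-v0; the snippet stops before the agent is constructed (a continuation sketch follows the code).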
import numpy as np

from mushroom_rl.algorithms.value import SARSALambdaContinuous
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core
from mushroom_rl.features import Features
from mushroom_rl.features.tiles import Tiles
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.callbacks import CollectDataset
from mushroom_rl.utils.parameters import Parameter
from mushroom_rl.environments import Gym

# MDP
mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

# Policy
epsilon = Parameter(value=0.)
pi = EpsGreedy(epsilon=epsilon)

# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10], mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size, ),
                           output_shape=(mdp.info.action_space.n, ),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)
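
The snippet above ends before the agent is built. A minimal continuation sketch, assuming the standard MushroomRL SARSALambdaContinuous constructor and the objects defined above; lambda_coeff and the episode count are illustrative values:

# Continuation sketch (not part of the original snippet)
agent = SARSALambdaContinuous(mdp.info, pi, LinearApproximator,
                              approximator_params=approximator_params,
                              learning_rate=learning_rate,
                              lambda_coeff=.9, features=features)

# Train the agent, fitting after every step
core = Core(agent, mdp)
core.learn(n_episodes=20, n_steps_per_fit=1)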
Example #11
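An A2C experiment with a Boltzmann torch policy on the discrete-action Acrobot-v1 task, evaluated after every epoch and rendered at the end.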
def experiment(n_epochs, n_steps, n_steps_per_fit, n_step_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    policy_params = dict(n_features=32, use_cuda=False)

    beta = Parameter(1e0)
    pi = BoltzmannTorchPolicy(Network,
                              mdp.info.observation_space.shape,
                              (mdp.info.action_space.n, ),
                              beta=beta,
                              **policy_params)

    # Agent
    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.RMSprop,
                             'params': {
                                 'lr': 1e-3,
                                 'eps': 1e-5
                             }
                         },
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    alg_params = dict(
        actor_optimizer={
            'class': optim.RMSprop,
            'params': {
                'lr': 1e-3,
                'eps': 3e-3
            }
        },
        critic_params=critic_params,
        #max_grad_norm=10.0,
        ent_coeff=0.01)

    agent = A2C(mdp.info, pi, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)

    # RUN
    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in trange(n_epochs):
        tqdm.write('Epoch: ' + str(n))
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = compute_J(dataset, gamma_eval)
        tqdm.write('J: ' + str(np.mean(J)))
        # core.evaluate(n_episodes=2, render=True)

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)