Example #1 (score: 0)
def learn(alg):
    """Run a short training of ``alg`` (a DDPG-family agent) on
    Pendulum-v0 and return the resulting policy.

    Args:
        alg: agent class with a DDPG-style constructor
            (mdp_info, policy_class, policy_params, actor_params,
            actor_optimizer, critic_params, batch_size,
            initial_replay_size, max_replay_size, tau).

    Returns:
        The trained agent's policy.
    """
    # Environment and fixed seeds for reproducibility.
    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Exploration: Ornstein-Uhlenbeck noise over the 1-D action.
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Hyper-parameters.
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Actor approximator: maps observations to actions.
    obs_shape = mdp.info.observation_space.shape
    act_shape = mdp.info.action_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=obs_shape,
                        output_shape=act_shape,
                        use_cuda=False)
    actor_optimizer = {'class': optim.Adam, 'params': {'lr': .001}}

    # Critic approximator: consumes the concatenated state-action vector.
    critic_params = dict(
        network=CriticNetwork,
        optimizer={'class': optim.Adam, 'params': {'lr': .001}},
        loss=F.mse_loss,
        n_features=n_features,
        input_shape=(obs_shape[0] + act_shape[0], ),
        output_shape=(1, ),
        use_cuda=False)

    # Agent and training loop.
    agent = alg(mdp.info, policy_class, policy_params, actor_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, tau)

    core = Core(agent, mdp)
    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent.policy
Example #2 (score: 0)
def test_gym():
    """Regression check: ten random-action Acrobot-v1 steps under fixed
    seeds must reproduce a known final observation."""
    np.random.seed(1)
    mdp = Gym('Acrobot-v1', 1000, .99)
    mdp.seed(1)
    mdp.reset()

    ns = None
    for _ in range(10):
        action = [np.random.randint(mdp.info.action_space.n)]
        ns, _, _, _ = mdp.step(action)

    expected = np.array([0.9996687, -0.02573896, 0.9839331,
                         -0.17853762, -0.17821608, 0.5534913])
    assert np.allclose(ns, expected)
Example #3 (score: 0)
def test_gym():
    """Regression check: ten random-action Acrobot-v1 steps under fixed
    seeds must reproduce a known final observation (variant snapshot)."""
    np.random.seed(1)
    mdp = Gym('Acrobot-v1', 1000, .99)
    mdp.seed(1)
    mdp.reset()

    ns = None
    for _ in range(10):
        action = [np.random.randint(mdp.info.action_space.n)]
        ns, _, _, _ = mdp.step(action)

    expected = np.array([0.99989477, 0.01450661, 0.97517825,
                         -0.22142128, -0.02323116, 0.40630765])
    assert np.allclose(ns, expected)
Example #4 (score: 0)
def experiment(alg, n_epochs, n_steps, n_steps_test, seed):
    """Train an option-based agent on the hybrid Pendulum-ID-v1 MDP and
    evaluate it after each epoch.

    Args:
        alg: agent class (SAC-like constructor, extended with a `rarhmm`
            keyword argument).
        n_epochs (int): number of train/evaluate epochs after warm-up.
        n_steps (int): environment steps per training epoch.
        n_steps_test (int): environment steps per evaluation.
        seed (int): seed for numpy, torch and the environment.

    Returns:
        Tuple of (final single-episode evaluation dataset,
        list of per-epoch J statistics, list of per-epoch datasets).
    """
    # Seed every RNG once. (Fix: the original seeded numpy twice with the
    # same value; the second call was redundant.)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # MDP
    horizon = 1000
    gamma = 0.99
    mdp = Gym('Pendulum-ID-v1', horizon, gamma)
    mdp.seed(seed)
    # Pre-trained recurrent AR-HMM, used both by the agent and by the
    # option-switching model; loaded relative to this file's location.
    rarhmm = torch.load(
        os.path.abspath(os.path.join(__file__, '..', '..')) +
        '/mushroom_rl/sds/envs/hybrid/models/neural_rarhmm_pendulum_cart.pkl',
        map_location='cpu')

    # Settings
    initial_replay_size = 512
    max_replay_size = 50000 * 4
    batch_size = 512
    n_critic_features = 64
    n_actor_features = 14
    warmup_transitions = 512
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Actor: separate mean and sigma networks (SAC-style Gaussian policy).
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_actor_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_actor_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    # Critic receives the concatenated state-action vector.
    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0], )
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_critic_features,
                         input_shape=critic_input_shape,
                         output_shape=(1, ),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info,
                actor_mu_params,
                actor_sigma_params,
                actor_optimizer,
                critic_params,
                batch_size,
                initial_replay_size,
                max_replay_size,
                warmup_transitions,
                tau,
                lr_alpha,
                critic_fit_params=None,
                rarhmm=rarhmm)

    option_switch_model = OptionSwitchingModel(rarhmm)

    # Algorithm
    core = OptionCore(agent, mdp, option_switch_model=option_switch_model)

    # Warm-up: fill the replay buffer before any gradient updates.
    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    J_results = []
    dataset_results = []

    # Baseline evaluation before training.
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    gamma = 1  # set gamma to 1 so J is the cumulated (undiscounted) reward
    J = compute_J(dataset, gamma)
    print('J: ', np.mean(J))
    J_results.append({'J_mean': np.mean(J), 'J_std': np.std(J)})
    dataset_results.append(dataset)

    for n in range(n_epochs):
        print('Epoch: ', n)
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma)
        print('J: ', np.mean(J))
        J_results.append({'J_mean': np.mean(J), 'J_std': np.std(J)})
        dataset_results.append(dataset)

    print('Press a button to visualize pendulum')
    # input()
    return core.evaluate(n_episodes=1,
                         render=False), J_results, dataset_results
Example #5 (score: 0)
def learn_sac():
    """Build a SAC agent for Pendulum-v0, run a short warm-up training
    pass, and return the agent."""

    # Environment with fixed seeds for reproducibility.
    mdp = Gym('Pendulum-v0', 200, 0.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Hyper-parameters.
    replay_init = 64
    replay_max = 50000
    batch_size = 64
    n_features = 64
    warmup = 10
    tau = 0.005
    lr_alpha = 3e-4

    # Gaussian policy: separate mean and sigma networks with the same
    # architecture (shallow copies of a common template).
    obs_shape = mdp.info.observation_space.shape
    act_shape = mdp.info.action_space.shape
    head_template = dict(network=ActorNetwork,
                         n_features=n_features,
                         input_shape=obs_shape,
                         output_shape=act_shape,
                         use_cuda=False)
    actor_mu_params = dict(head_template)
    actor_sigma_params = dict(head_template)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    # Critic operates on the concatenated state-action vector.
    critic_params = dict(
        network=CriticNetwork,
        optimizer={'class': optim.Adam, 'params': {'lr': 3e-4}},
        loss=F.mse_loss,
        n_features=n_features,
        input_shape=(obs_shape[0] + act_shape[0], ),
        output_shape=(1, ),
        use_cuda=False)

    agent = SAC(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size, replay_init,
                replay_max, warmup, tau, lr_alpha, critic_fit_params=None)

    # Fill the replay buffer, then fit once.
    core = Core(agent, mdp)
    core.learn(n_steps=2 * replay_init, n_steps_per_fit=replay_init)

    return agent
Example #6 (score: 0)
def test_sac():
    """Deterministic regression test: after a fixed-seed SAC warm-up on
    Pendulum-v0, the policy weights must match a known snapshot."""
    # Environment with fixed seeds.
    mdp = Gym('Pendulum-v0', 200, 0.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Hyper-parameters.
    replay_init = 64
    replay_max = 50000
    batch_size = 64
    n_features = 64
    warmup = 10
    tau = 0.005
    lr_alpha = 3e-4

    # Gaussian policy: mean and sigma networks share one architecture
    # (shallow copies of a common template).
    obs_shape = mdp.info.observation_space.shape
    act_shape = mdp.info.action_space.shape
    head_template = dict(network=ActorNetwork,
                         n_features=n_features,
                         input_shape=obs_shape,
                         output_shape=act_shape,
                         use_cuda=False)
    actor_mu_params = dict(head_template)
    actor_sigma_params = dict(head_template)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    # Critic operates on the concatenated state-action vector.
    critic_params = dict(
        network=CriticNetwork,
        optimizer={'class': optim.Adam, 'params': {'lr': 3e-4}},
        loss=F.mse_loss,
        n_features=n_features,
        input_shape=(obs_shape[0] + act_shape[0], ),
        output_shape=(1, ),
        use_cuda=False)

    agent = SAC(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size, replay_init,
                replay_max, warmup, tau, lr_alpha, critic_fit_params=None)

    # Fill the replay buffer, then fit once.
    core = Core(agent, mdp)
    core.learn(n_steps=2 * replay_init, n_steps_per_fit=replay_init)

    expected = np.array([1.6998193, -0.732528, 1.2986078, -0.26860124,
                         0.5094043, -0.5001421, -0.18989229, -0.30646914])
    assert np.allclose(agent.policy.get_weights(), expected)
def experiment(alg, n_epochs, n_steps, n_steps_test, seed):
    """Train a SAC-style agent ``alg`` on Pendulum-ID-v1 and evaluate it
    once before training and after every epoch.

    Returns a tuple of (final single-episode evaluation dataset,
    per-epoch J statistics, per-epoch evaluation datasets).
    """
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Environment (horizon 1000, discount 0.99).
    mdp = Gym('Pendulum-ID-v1', 1000, 0.99)
    mdp.seed(seed)
    # mdp = Gym('Pendulum-v0', horizon, gamma)

    # Hyper-parameters.
    replay_init = 512
    replay_max = 50000 * 4
    batch_size = 512
    n_features = 64
    warmup = 512
    tau = 0.005
    lr_alpha = 3e-4
    use_cuda = torch.cuda.is_available()

    # Gaussian policy: mean and sigma networks share one architecture
    # (shallow copies of a common template).
    obs_shape = mdp.info.observation_space.shape
    act_shape = mdp.info.action_space.shape
    head_template = dict(network=ActorNetwork,
                         n_features=n_features,
                         input_shape=obs_shape,
                         output_shape=act_shape,
                         use_cuda=use_cuda)
    actor_mu_params = dict(head_template)
    actor_sigma_params = dict(head_template)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    # Critic operates on the concatenated state-action vector.
    critic_params = dict(
        network=CriticNetwork,
        optimizer={'class': optim.Adam, 'params': {'lr': 3e-4}},
        loss=F.mse_loss,
        n_features=n_features,
        input_shape=(obs_shape[0] + act_shape[0],),
        output_shape=(1,),
        use_cuda=use_cuda)

    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size, replay_init,
                replay_max, warmup, tau, lr_alpha, critic_fit_params=None)

    core = Core(agent, mdp)

    # Warm-up: fill the replay buffer before any gradient updates.
    core.learn(n_steps=replay_init, n_steps_per_fit=replay_init)

    J_results = []
    dataset_results = []

    # Baseline evaluation; gamma = 1 turns J into the cumulated reward.
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    gamma = 1
    J = compute_J(dataset, gamma)
    print('J: ', np.mean(J))
    J_results.append({'J_mean': np.mean(J), 'J_std': np.std(J)})
    dataset_results.append(dataset)

    for epoch in range(n_epochs):
        print('Epoch: ', epoch)
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma)
        print('J: ', np.mean(J))
        J_results.append({'J_mean': np.mean(J), 'J_std': np.std(J)})
        dataset_results.append(dataset)

    print('Press a button to visualize pendulum')
    # input()
    return core.evaluate(n_episodes=1, render=False), J_results, dataset_results