Example #1
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info,
                     policy,
                     mu,
                     alpha_theta,
                     alpha_omega,
                     alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high, phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
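A minimal invocation sketch for this example; the epoch and episode counts below are illustrative, not taken from the source:

if __name__ == '__main__':
    experiment(n_epochs=25, n_episodes=10)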
Example #2
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # save normalization / plot states to disk path
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # load states from disk path
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")
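A minimal invocation sketch (argument values are illustrative, not from the source):

if __name__ == '__main__':
    experiment(n_epochs=10, n_iterations=10, ep_per_run=25,
               save_states_to_disk=False)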
Example #3
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = GraspEnv()

    # Policy
    n_weights = 6
    mu = np.array([-0.5, 0.0, 0.91, m.pi, 0, 0])
    sigma = np.asarray([0.05, 0.05, 0.05, 0.1, 0.1, 0.1])
    policy = Own_policy()
    dist = GaussianDiagonalDistribution(mu, sigma)
    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    # Records of the learning curve
    means, avg_sigma, list_J = [], [], []

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_episodes_per_fit=n_ep_per_fit)

        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()
        means.append(p[:n_weights].copy())
        avg_sigma.append(np.mean(p[n_weights:]))

        # record the learning curve of cumulative rewards
        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])
        list_J.append(np.mean(J))
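GraspEnv and Own_policy are custom classes defined elsewhere in the script; a hedged invocation sketch assuming a black-box algorithm such as `from mushroom_rl.algorithms.policy_search import REPS`, with illustrative values:

if __name__ == '__main__':
    experiment(alg=REPS, params=dict(eps=1.0),
               n_epochs=10, n_episodes=25, n_ep_per_fit=25)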
Example #4
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps, n_steps_per_fit,
               n_step_test, alg_params, policy_params):
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={
                             'class': optim.RMSprop,
                             'params': {
                                 'lr': 7e-4,
                                 'eps': 1e-5
                             }
                         },
                         loss=F.mse_loss,
                         n_features=64,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1, ))

    alg_params['critic_params'] = critic_params

    policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape, **policy_params)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_steps=n_step_test, render=False)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    logger.epoch_info(0, J=J, R=R, entropy=E)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        logger.epoch_info(it + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
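An invocation sketch; the hyperparameter values are illustrative and assume MushroomRL 1.x conventions (`std_0` for GaussianTorchPolicy, `ent_coeff` for A2C), not values from the source:

if __name__ == '__main__':
    policy_params = dict(std_0=1., n_features=64, use_cuda=False)
    alg_params = dict(actor_optimizer={'class': optim.RMSprop,
                                       'params': {'lr': 7e-4, 'eps': 3e-3}},
                      ent_coeff=0.01)
    experiment(alg=A2C, env_id='Pendulum-v1', horizon=200, gamma=0.99,
               n_epochs=40, n_steps=1000, n_steps_per_fit=5, n_step_test=2000,
               alg_params=alg_params, policy_params=policy_params)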
Example #5
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = ShipSteering()

    # Policy
    high = np.array([150, 150, np.pi], dtype=float)
    low = np.array([0, 0, -np.pi], dtype=float)
    n_tiles = [5, 5, 6]
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings,
                             n_tiles=n_tiles,
                             low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size, )

    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J))
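A hedged usage sketch assuming a black-box optimizer such as `from mushroom_rl.algorithms.policy_search import REPS`; the values are illustrative:

if __name__ == '__main__':
    experiment(alg=REPS, params=dict(eps=1.0),
               n_epochs=25, fit_per_epoch=10, ep_per_fit=20)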
Example #6
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])

    logger.info('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
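As above, `alg` is expected to be a distribution-based black-box algorithm; a sketch assuming `from mushroom_rl.algorithms.policy_search import REPS`, with illustrative values:

if __name__ == '__main__':
    experiment(alg=REPS, params=dict(eps=0.05),
               n_epochs=25, n_episodes=100, n_ep_per_fit=25)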
Example #7
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=2, max_action=1., max_pos=1.)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 0.25 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=1e-2)
    algorithm_params = dict(optimizer=optimizer)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0,
                      J=np.mean(J),
                      policy_weights=policy.get_weights().tolist())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          policy_weights=policy.get_weights().tolist())
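The signature matches MushroomRL's policy-gradient algorithms; a sketch assuming `from mushroom_rl.algorithms.policy_search import REINFORCE` and illustrative counts:

if __name__ == '__main__':
    experiment(alg=REINFORCE, n_epochs=10, n_iterations=4, ep_per_run=25)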
Example #8
def experiment(goal, use_muscles, n_epochs, n_steps, n_episodes_test):
    np.random.seed(1)

    logger = Logger('SAC', results_dir=None)
    logger.strong_line()
    logger.info('Humanoid Experiment, Algorithm: SAC')

    # MDP
    gamma = 0.99
    horizon = 2000
    mdp = create_mdp(gamma, horizon, goal, use_muscles=use_muscles)

    # Agent
    agent = create_SAC_agent(mdp)

    # normalization callback
    normalizer = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info)

    # Algorithm(with normalization and plotting)
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[normalizer])
    dataset = core.evaluate(n_episodes=n_episodes_test, render=True)

    J = np.mean(compute_J(dataset, gamma))
    L = int(np.round(np.mean(episodes_length(dataset))))

    logger.epoch_info(0, J=J, episode_length=L)

    # training loop
    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=True)

        J = np.mean(compute_J(dataset, gamma))
        L = int(np.round(np.mean(episodes_length(dataset))))

        logger.epoch_info(n + 1, J=J, episode_length=L)

    logger.info('Press a button to visualize humanoid')
    input()
    core.evaluate(n_episodes=10, render=True)
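`create_mdp` and `create_SAC_agent` are helpers defined elsewhere in this script; a hypothetical invocation (the goal string and counts are placeholders, not from the source):

if __name__ == '__main__':
    experiment(goal='walk', use_muscles=True,
               n_epochs=100, n_steps=10000, n_episodes_test=10)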
Example #9
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0,
                      J=np.mean(J),
                      distribution_parameters=distribution.get_parameters())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(
            i + 1,
            J=np.mean(J),
            distribution_parameters=distribution.get_parameters())
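A hedged sketch assuming `from mushroom_rl.algorithms.policy_search import RWR`; parameter values are illustrative:

if __name__ == '__main__':
    experiment(alg=RWR, params=dict(beta=1.),
               n_epochs=25, fit_per_epoch=10, ep_per_fit=10)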
Example #10
def experiment(alg, n_epochs, n_steps, n_episodes_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    gamma = 0.99
    habitat_root_path = Habitat.root_path()
    config_file = os.path.join(
        habitat_root_path, 'habitat_baselines/config/rearrange/rl_pick.yaml')
    base_config_file = os.path.join(habitat_root_path,
                                    'configs/tasks/rearrange/pick.yaml')
    wrapper = 'HabitatRearrangeWrapper'
    mdp = Habitat(wrapper, config_file, base_config_file, gamma=gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = actor_input_shape + mdp.info.action_space.shape
    critic_params = dict(network=CriticNetwork,
                         optimizer={
                             'class': optim.Adam,
                             'params': {
                                 'lr': 3e-4
                             }
                         },
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1, ),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info,
                actor_mu_params,
                actor_sigma_params,
                actor_optimizer,
                critic_params,
                batch_size,
                initial_replay_size,
                max_replay_size,
                warmup_transitions,
                tau,
                lr_alpha,
                critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
    s, *_ = parse_dataset(dataset)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)

    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        s, *_ = parse_dataset(dataset)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)

        logger.epoch_info(n + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize the robot')
    input()
    core.evaluate(n_episodes=5, render=True)
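The positional arguments passed to `alg` match SAC's constructor; a sketch assuming `from mushroom_rl.algorithms.actor_critic import SAC` and an installed Habitat setup, with illustrative counts:

if __name__ == '__main__':
    experiment(alg=SAC, n_epochs=50, n_steps=5000, n_episodes_test=5)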
Example #11
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    use_cuda = torch.cuda.is_available()

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v1', horizon, gamma)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': .001}}

    critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': .001}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params,
                actor_params, actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, tau)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = np.mean(compute_J(dataset, gamma))
    R = np.mean(compute_J(dataset))

    logger.epoch_info(0, J=J, R=R)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = np.mean(compute_J(dataset, gamma))
        R = np.mean(compute_J(dataset))

        logger.epoch_info(n+1, J=J, R=R)

    logger.info('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
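The agent construction matches the DDPG/TD3 signature; a sketch assuming `from mushroom_rl.algorithms.actor_critic import DDPG` (counts illustrative):

if __name__ == '__main__':
    experiment(alg=DDPG, n_epochs=40, n_steps=1000, n_steps_test=2000)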
Example #12
# Assumed imports and logger setup for this standalone snippet
# (module paths follow MushroomRL 1.x):
from time import sleep

import numpy as np
from tqdm import trange

from mushroom_rl.algorithms.value import QLearning
from mushroom_rl.core import Core, Logger
from mushroom_rl.environments.generators.simple_chain import generate_simple_chain
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.dataset import compute_J
from mushroom_rl.utils.parameters import Parameter

# results_dir is set so that logger.log_numpy below can write J.npy / R.npy
logger = Logger(QLearning.__name__, results_dir='./logs', log_console=True)

mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1, gamma=.9)
epsilon = Parameter(value=.15)
pi = EpsGreedy(epsilon=epsilon)
agent = QLearning(mdp.info, pi, learning_rate=Parameter(value=.2))
core = Core(agent, mdp)
epochs = 10

# Initial policy Evaluation
logger.info('Experiment started')
logger.strong_line()

dataset = core.evaluate(n_steps=100)
J = np.mean(compute_J(dataset, mdp.info.gamma))  # Discounted returns
R = np.mean(compute_J(dataset))  # Undiscounted returns

logger.epoch_info(0, J=J, R=R, any_label='any value')

for i in trange(epochs):
    # Here some learning
    core.learn(n_steps=100, n_steps_per_fit=1)
    sleep(0.5)
    dataset = core.evaluate(n_steps=100)
    sleep(0.5)
    J = np.mean(compute_J(dataset, mdp.info.gamma))  # Discounted returns
    R = np.mean(compute_J(dataset))  # Undiscounted returns

    # Here logging epoch results to the console
    logger.epoch_info(i+1, J=J, R=R)

    # Logging the data in J.npy and R.npy
    logger.log_numpy(J=J, R=R)
Example #13
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v1', horizon, gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam,
                       'params': {'lr': 3e-4}}

    critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size, initial_replay_size,
                max_replay_size, warmup_transitions, tau, lr_alpha,
                critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    s, *_ = parse_dataset(dataset)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)

    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size, n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        s, *_ = parse_dataset(dataset)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)

        logger.epoch_info(n+1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
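The positional arguments match SAC's constructor; a sketch assuming `from mushroom_rl.algorithms.actor_critic import SAC` (counts illustrative):

if __name__ == '__main__':
    experiment(alg=SAC, n_epochs=40, n_steps=1000, n_steps_test=2000)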
Example #14
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(StochasticAC_AVG.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + StochasticAC_AVG.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator,
                    input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = StochasticAC_AVG(mdp.info,
                             policy,
                             alpha_theta,
                             alpha_v,
                             alpha_r,
                             lambda_par=.5,
                             value_function_features=psi,
                             policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high, phi, psi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
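A minimal invocation sketch; the counts below are illustrative, not from the source:

if __name__ == '__main__':
    experiment(n_epochs=25, n_episodes=5)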
Example #15
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(DQN.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + DQN.__name__)

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearParameter(value=1., threshold_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={
                                   'class': optim.Adam,
                                   'params': {
                                       'lr': .001
                                   }
                               },
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(mdp.info,
                pi,
                TorchApproximator,
                approximator_params=approximator_params,
                batch_size=batch_size,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    logger.epoch_info(0, J=np.mean(J))

    for n in trange(n_epochs):
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        logger.epoch_info(n + 1, J=np.mean(J))

    logger.info('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
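A minimal invocation sketch with illustrative step counts:

if __name__ == '__main__':
    experiment(n_epochs=20, n_steps=1000, n_steps_test=2000)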
Example #16
def experiment(n_epochs, n_steps, n_steps_per_fit, n_step_test):
    np.random.seed()

    logger = Logger(A2C.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + A2C.__name__)

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    policy_params = dict(
        n_features=32,
        use_cuda=False
    )

    beta = Parameter(1e0)
    pi = BoltzmannTorchPolicy(Network,
                              mdp.info.observation_space.shape,
                              (mdp.info.action_space.n,),
                              beta=beta,
                              **policy_params)

    # Agent
    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 1e-3,
                                               'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params = dict(actor_optimizer={'class': optim.RMSprop,
                                       'params': {'lr': 1e-3,
                                                  'eps': 3e-3}},
                      critic_params=critic_params,
                      ent_coeff=0.01
                      )

    agent = A2C(mdp.info, pi, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)

    # RUN
    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = compute_J(dataset, gamma_eval)
    logger.epoch_info(0, J=np.mean(J))

    for n in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = compute_J(dataset, gamma_eval)
        logger.epoch_info(n+1, J=np.mean(J))

    logger.info('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
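A minimal invocation sketch with illustrative step counts:

if __name__ == '__main__':
    experiment(n_epochs=40, n_steps=1000, n_steps_per_fit=5, n_step_test=2000)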