Example #1
# imports assumed for this example (MushroomRL 1.x API with AdaptiveParameter)
import os

import numpy as np

from mushroom_rl.algorithms.policy_search import REINFORCE
from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core
from mushroom_rl.environments import LQR
from mushroom_rl.policy import StateStdGaussianPolicy
from mushroom_rl.utils.callbacks import PlotDataset
from mushroom_rl.utils.dataset import compute_J
from mushroom_rl.utils.parameters import AdaptiveParameter
from mushroom_rl.utils.preprocessors import MinMaxPreprocessor


def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5., episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # state normalization preprocessor
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        print('Epoch: ', n, '  J: ', np.mean(compute_J(dataset,
                                                       mdp.info.gamma)))

    if save_states_to_disk:
        # save normalization and plotting state to disk
        os.makedirs("./temp/", exist_ok=True)
        prepro.save_state("./temp/normalization_state")
        plotter.save_state("./temp/plotting_state")

        # load the states back from disk
        prepro.load_state("./temp/normalization_state")
        plotter.load_state("./temp/plotting_state")
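
A possible entry point for running this experiment could look as follows; the hyperparameter values are only illustrative and not taken from an original script.

if __name__ == '__main__':
    experiment(n_epochs=10, n_iterations=4, ep_per_run=25,
               save_states_to_disk=False)
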
Example #2
# imports assumed for this test (MushroomRL 1.x API + PyTorch)
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.algorithms.value import DQN
from mushroom_rl.approximators.parametric import TorchApproximator
from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.parameters import Parameter
from mushroom_rl.utils.preprocessors import MinMaxPreprocessor


def test_normalizing_preprocessor():
    np.random.seed(88)

    class Network(nn.Module):
        def __init__(self, input_shape, output_shape, **kwargs):
            super().__init__()

            n_input = input_shape[-1]
            n_output = output_shape[0]

            self._h1 = nn.Linear(n_input, n_output)

            nn.init.xavier_uniform_(self._h1.weight,
                                    gain=nn.init.calculate_gain('relu'))

        def forward(self, state, action=None):
            q = F.relu(self._h1(torch.squeeze(state, 1).float()))
            if action is None:
                return q
            else:
                action = action.long()
                q_acted = torch.squeeze(q.gather(1, action))
                return q_acted

    mdp = Gym('CartPole-v0', horizon=500, gamma=.99)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape

    approximator_params = dict(network=Network,
                               optimizer={'class':  optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n,
                               n_features=2, use_cuda=False)

    alg_params = dict(batch_size=5, n_approximators=1, initial_replay_size=10,
                      max_replay_size=500, target_update_frequency=50)

    agent = DQN(mdp.info, pi, TorchApproximator,
                approximator_params=approximator_params, **alg_params)

    # state normalization preprocessor: clip_obs clips the normalized
    # observations to [-5, 5], alpha is the moving-average rate used to
    # update the running statistics
    norm_box = MinMaxPreprocessor(mdp_info=mdp.info,
                                  clip_obs=5.0, alpha=0.001)

    core = Core(agent, mdp, preprocessors=[norm_box])

    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    # states collected during training must respect the clipping bounds
    assert (core._state.min() >= -norm_box.clip_obs
            and core._state.max() <= norm_box.clip_obs)

    # saving and re-loading the preprocessor state must preserve its statistics
    state_dict1 = norm_box.get_state()
    norm_box.save_state("./run_norm_state")

    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    norm_box.load_state("./run_norm_state")
    state_dict2 = norm_box.get_state()

    os.remove("./run_norm_state")
    assert ((state_dict1["mean"] == state_dict2["mean"]).all()
            and (state_dict1["var"] == state_dict2["var"]).all()
            and state_dict1["count"] == state_dict2["count"])
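
For reference, the "mean", "var" and "count" entries compared above are the running statistics that an observation-standardization preprocessor of this kind maintains. The sketch below is a minimal, self-contained illustration of running standardization with clipping; it is not MushroomRL's implementation, and the class and parameter names in it are made up.

import numpy as np


class RunningStandardizer:
    # Illustrative sketch: keep a running mean/variance of the observations,
    # standardize each new observation and clip the result.
    def __init__(self, shape, clip_obs=5.0, alpha=1e-3):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 0
        self.clip_obs = clip_obs
        self.alpha = alpha

    def __call__(self, obs):
        # update the running statistics with an exponential moving average;
        # early on, 1 / count dominates so the estimates catch up quickly
        self.count += 1
        rate = max(self.alpha, 1.0 / self.count)
        delta = obs - self.mean
        self.mean = self.mean + rate * delta
        self.var = (1.0 - rate) * (self.var + rate * delta ** 2)

        # standardize and clip the observation
        return np.clip((obs - self.mean) / np.sqrt(self.var + 1e-8),
                       -self.clip_obs, self.clip_obs)


# usage sketch
standardizer = RunningStandardizer(shape=(4,))
normalized = standardizer(np.random.randn(4))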