def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5.,
                       episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        print('Epoch: ', n, ' J: ',
              np.mean(compute_J(dataset, mdp.info.gamma)))

    if save_states_to_disk:
        # save normalization / plot states to disk path
        os.makedirs("./temp/", exist_ok=True)
        prepro.save_state("./temp/normalization_state")
        plotter.save_state("./temp/plotting_state")

        # load states from disk path
        prepro.load_state("./temp/normalization_state")
        plotter.load_state("./temp/plotting_state")
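# Example invocation of the experiment above; the epoch/iteration/episode
# counts here are illustrative assumptions, not values from the original
# script.
if __name__ == '__main__':
    experiment(n_epochs=10, n_iterations=4, ep_per_run=25,
               save_states_to_disk=False)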
def test_normalizing_preprocessor():
    np.random.seed(88)

    class Network(nn.Module):
        def __init__(self, input_shape, output_shape, **kwargs):
            super().__init__()

            n_input = input_shape[-1]
            n_output = output_shape[0]

            self._h1 = nn.Linear(n_input, n_output)

            nn.init.xavier_uniform_(self._h1.weight,
                                    gain=nn.init.calculate_gain('relu'))

        def forward(self, state, action=None):
            q = F.relu(self._h1(torch.squeeze(state, 1).float()))

            if action is None:
                return q
            else:
                action = action.long()
                q_acted = torch.squeeze(q.gather(1, action))

                return q_acted

    mdp = Gym('CartPole-v0', horizon=500, gamma=.99)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n,
                               n_features=2,
                               use_cuda=False)

    # Agent
    alg_params = dict(batch_size=5,
                      n_approximators=1,
                      initial_replay_size=10,
                      max_replay_size=500,
                      target_update_frequency=50)

    agent = DQN(mdp.info, pi, TorchApproximator,
                approximator_params=approximator_params,
                **alg_params)

    # normalization preprocessor with observation clipping
    norm_box = MinMaxPreprocessor(mdp_info=mdp.info, clip_obs=5.0,
                                  alpha=0.001)

    core = Core(agent, mdp, preprocessors=[norm_box])

    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    # check that the observations seen during training are clipped to
    # [-clip_obs, clip_obs]
    assert (core._state.min() >= -norm_box.clip_obs
            and core._state.max() <= norm_box.clip_obs)

    # check that saving and reloading the preprocessor state restores the
    # running statistics exactly
    state_dict1 = norm_box.get_state()
    norm_box.save_state("./run_norm_state")

    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    norm_box.load_state("./run_norm_state")
    state_dict2 = norm_box.get_state()

    os.remove("./run_norm_state")

    assert ((state_dict1["mean"] == state_dict2["mean"]).all()
            and (state_dict1["var"] == state_dict2["var"]).all()
            and state_dict1["count"] == state_dict2["count"])
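# Both snippets above rely on the following imports. The mushroom_rl module
# paths shown here are an assumption based on the 1.x layout of the library
# and may differ in other versions.
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.algorithms.policy_search import REINFORCE
from mushroom_rl.algorithms.value import DQN
from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator, \
    TorchApproximator
from mushroom_rl.core import Core
from mushroom_rl.environments import LQR, Gym
from mushroom_rl.policy import EpsGreedy, StateStdGaussianPolicy
from mushroom_rl.utils.callbacks import PlotDataset
from mushroom_rl.utils.dataset import compute_J
from mushroom_rl.utils.parameters import AdaptiveParameter, Parameter
from mushroom_rl.utils.preprocessors import MinMaxPreprocessor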