def learn(alg, alg_params):
    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return agent
def test_a2c():
    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    policy_params = dict(std_0=1., n_features=64, use_cuda=False)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4, 'eps': 1e-5}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    algorithm_params = dict(critic_params=critic_params,
                            actor_optimizer={'class': optim.RMSprop,
                                             'params': {'lr': 7e-4,
                                                        'eps': 3e-3}},
                            max_grad_norm=0.5,
                            ent_coeff=0.01)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = A2C(mdp.info, policy, **algorithm_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=10, n_episodes_per_fit=5)

    w = agent.policy.get_weights()
    w_test = np.array([-1.6307759, 1.0356185, -0.34508315, 0.27108294,
                       -0.01047843])

    assert np.allclose(w, w_test)
def learn(alg, alg_params):
    class Network(nn.Module):
        def __init__(self, input_shape, output_shape, **kwargs):
            super(Network, self).__init__()

            n_input = input_shape[-1]
            n_output = output_shape[0]

            self._h = nn.Linear(n_input, n_output)

            nn.init.xavier_uniform_(self._h.weight,
                                    gain=nn.init.calculate_gain('relu'))

        def forward(self, state, **kwargs):
            return F.relu(self._h(torch.squeeze(state, 1).float()))

    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return policy
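# A hypothetical invocation of the learn() helper above, training A2C for
# two episodes on Pendulum-v0. The optimizer settings here are illustrative
# assumptions (borrowed from the A2C snippets in this section), not taken
# from the source.
params = dict(actor_optimizer={'class': optim.RMSprop,
                               'params': {'lr': 7e-4, 'eps': 3e-3}},
              ent_coeff=0.01)
policy = learn(A2C, params)
print(policy.get_weights())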
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps,
               n_steps_per_fit, n_episodes_test, alg_params, policy_params):
    print(alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()
    tqdm.write('END OF EPOCH 0')
    tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
    tqdm.write('#' * 98)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()
        tqdm.write('END OF EPOCH ' + str(it + 1))
        tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
        tqdm.write('#' * 98)

    print('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
def learn_a2c():
    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    policy_params = dict(std_0=1., n_features=64, use_cuda=False)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4, 'eps': 1e-5}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    algorithm_params = dict(critic_params=critic_params,
                            actor_optimizer={'class': optim.RMSprop,
                                             'params': {'lr': 7e-4,
                                                        'eps': 3e-3}},
                            max_grad_norm=0.5,
                            ent_coeff=0.01)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = A2C(mdp.info, policy, **algorithm_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent
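# A hypothetical determinism check for learn_a2c() above: since the helper
# re-fixes the NumPy, torch, and environment seeds on every call, two runs
# should yield identical policy weights.
agent_1 = learn_a2c()
agent_2 = learn_a2c()
assert np.allclose(agent_1.policy.get_weights(),
                   agent_2.policy.get_weights())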
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps,
               n_steps_per_fit, n_step_test, alg_params, policy_params):
    # Log under the name of the algorithm actually passed in, rather than
    # hard-coding A2C.
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4, 'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=64,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params['critic_params'] = critic_params

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    logger.epoch_info(0, J=J, R=R, entropy=E)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        logger.epoch_info(it + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
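# A hypothetical entry point for the experiment above, running A2C on
# Pendulum-v0; the hyperparameters are illustrative assumptions borrowed
# from the A2C snippets earlier in this section, and the step counts are
# not taken from the source.
if __name__ == '__main__':
    policy_params = dict(std_0=1., n_features=64, use_cuda=False)
    a2c_params = dict(actor_optimizer={'class': optim.RMSprop,
                                       'params': {'lr': 7e-4,
                                                  'eps': 3e-3}},
                      max_grad_norm=0.5,
                      ent_coeff=0.01)
    experiment(alg=A2C, env_id='Pendulum-v0', horizon=200, gamma=.99,
               n_epochs=10, n_steps=30000, n_steps_per_fit=5,
               n_step_test=5000, alg_params=a2c_params,
               policy_params=policy_params)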
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='Acrobot-v1', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10, 10, 10, 10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda_coeff': .9}
    agent = TrueOnlineSARSALambda(mdp.info, pi,
                                  approximator_params=approximator_params,
                                  features=features,
                                  **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_steps_per_fit=1, render=True)

    dataset = core.evaluate(n_episodes=1, render=False)
    print(episodes_length(dataset))

    return np.mean(compute_J(dataset, .96))
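# A hypothetical sweep over the step-size scale used by the experiment
# above; the alpha values are illustrative assumptions.
if __name__ == '__main__':
    alphas = [.1, .2, .3]
    Js = [experiment(alpha) for alpha in alphas]
    print(Js)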
import numpy as np

from mushroom_rl.algorithms.value import QLearning
from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.parameters import Parameter

mdp = Gym(name='FrozenLake-v0', horizon=np.inf, gamma=1.)

epsilon = Parameter(value=1.)
policy = EpsGreedy(epsilon=epsilon)

learning_rate = Parameter(value=.6)
agent = QLearning(mdp.info, policy, learning_rate)

core = Core(agent, mdp)
core.learn(n_steps=10000, n_steps_per_fit=1)

# Tabulate the learned Q-function, one entry per (state, action) pair.
shape = agent.approximator.shape
q = np.zeros(shape)
for i in range(shape[0]):
    for j in range(shape[1]):
        state = np.array([i])
        action = np.array([j])
        q[i, j] = agent.approximator.predict(state, action)
print(q)
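# A hypothetical greedy evaluation after training: set epsilon to zero and
# measure the average undiscounted return. compute_J comes from
# mushroom_rl.utils.dataset; the episode count is an assumption.
from mushroom_rl.utils.dataset import compute_J

agent.policy.set_epsilon(Parameter(value=0.))
dataset = core.evaluate(n_episodes=100)
print(np.mean(compute_J(dataset)))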
def test_normalizing_preprocessor(tmpdir):
    np.random.seed(88)

    class Network(nn.Module):
        def __init__(self, input_shape, output_shape, **kwargs):
            super().__init__()

            n_input = input_shape[-1]
            n_output = output_shape[0]

            self._h1 = nn.Linear(n_input, n_output)

            nn.init.xavier_uniform_(self._h1.weight,
                                    gain=nn.init.calculate_gain('relu'))

        def forward(self, state, action=None):
            q = F.relu(self._h1(torch.squeeze(state, 1).float()))

            if action is None:
                return q
            else:
                action = action.long()
                q_acted = torch.squeeze(q.gather(1, action))

                return q_acted

    mdp = Gym('CartPole-v0', horizon=500, gamma=.99)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n,
                               n_features=2,
                               use_cuda=False)

    alg_params = dict(batch_size=5,
                      initial_replay_size=10,
                      max_replay_size=500,
                      target_update_frequency=50)

    agent = DQN(mdp.info, pi, TorchApproximator,
                approximator_params=approximator_params, **alg_params)

    norm_box = MinMaxPreprocessor(mdp_info=mdp.info, clip_obs=5.0,
                                  alpha=0.001)

    core = Core(agent, mdp, preprocessors=[norm_box])

    # Training with the preprocessor in the loop should keep the
    # normalized states within the clipping range
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    assert (core._state.min() >= -norm_box._clip_obs
            and core._state.max() <= norm_box._clip_obs)

    # Saving and loading should preserve the running statistics
    state_dict1 = norm_box.get_state()
    norm_box.save(tmpdir / 'norm_box.msh')

    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    norm_box = MinMaxPreprocessor.load(tmpdir / 'norm_box.msh')
    state_dict2 = norm_box.get_state()

    assert ((state_dict1["mean"] == state_dict2["mean"]).all()
            and (state_dict1["var"] == state_dict2["var"]).all()
            and state_dict1["count"] == state_dict2["count"])

    core = Core(agent, mdp, preprocessors=[norm_box])
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)
import numpy as np

from mushroom_rl.algorithms.value import SARSALambdaContinuous
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.features import Features
from mushroom_rl.features.tiles import Tiles
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.utils.callbacks import CollectDataset
from mushroom_rl.utils.parameters import Parameter

# MDP
mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

# Policy
epsilon = Parameter(value=0.)
pi = EpsGreedy(epsilon=epsilon)

# Q-function approximator
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)
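# The script above stops after defining the learning rate. A plausible
# completion, matching the imports at the top (SARSALambdaContinuous,
# LinearApproximator, CollectDataset); the lambda coefficient and episode
# count are illustrative assumptions.
agent = SARSALambdaContinuous(mdp.info, pi, LinearApproximator,
                              approximator_params=approximator_params,
                              learning_rate=learning_rate,
                              lambda_coeff=.9, features=features)

# Algorithm: collect the transitions seen during fitting
collect_dataset = CollectDataset()
core = Core(agent, mdp, callbacks_fit=[collect_dataset])

# Train
core.learn(n_episodes=100, n_steps_per_fit=1)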
def experiment(n_epochs, n_steps, n_steps_per_fit, n_step_test):
    np.random.seed()

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    policy_params = dict(n_features=32, use_cuda=False)

    beta = Parameter(1e0)
    pi = BoltzmannTorchPolicy(Network,
                              mdp.info.observation_space.shape,
                              (mdp.info.action_space.n,),
                              beta=beta,
                              **policy_params)

    # Agent
    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 1e-3, 'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params = dict(actor_optimizer={'class': optim.RMSprop,
                                       'params': {'lr': 1e-3,
                                                  'eps': 3e-3}},
                      critic_params=critic_params,
                      ent_coeff=0.01)

    agent = A2C(mdp.info, pi, **alg_params)

    # Algorithm
    core = Core(agent, mdp)
    core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)

    # RUN
    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = compute_J(dataset, gamma_eval)
    print('J: ', np.mean(J))

    for n in trange(n_epochs):
        tqdm.write('Epoch: ' + str(n))
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = compute_J(dataset, gamma_eval)
        tqdm.write('J: ' + str(np.mean(J)))

    print('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
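# A hypothetical entry point for the Acrobot experiment above; the epoch
# and step counts are illustrative assumptions.
if __name__ == '__main__':
    experiment(n_epochs=10, n_steps=1000, n_steps_per_fit=5,
               n_step_test=2000)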