def learn(alg):
    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=False)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': .001}}

    critic_input_shape = (
        actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': .001}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=False)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params, actor_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, tau)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent.policy
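# Usage sketch (assumption, not part of the original script): the positional
# arguments assembled in `learn` match the constructors of mushroom_rl's DDPG
# and TD3, so the helper could be driven as follows.
from mushroom_rl.algorithms.actor_critic import DDPG, TD3


def check_deterministic_policies():
    # Learn with both algorithms and return the resulting policies.
    return learn(DDPG), learn(TD3)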
def test_gym():
    np.random.seed(1)
    mdp = Gym('Acrobot-v1', 1000, .99)
    mdp.seed(1)
    mdp.reset()
    for i in range(10):
        ns, r, ab, _ = mdp.step([np.random.randint(mdp.info.action_space.n)])
    ns_test = np.array([0.9996687, -0.02573896, 0.9839331, -0.17853762,
                        -0.17821608, 0.5534913])

    assert np.allclose(ns, ns_test)
def test_gym():
    np.random.seed(1)
    mdp = Gym('Acrobot-v1', 1000, .99)
    mdp.seed(1)
    mdp.reset()
    for i in range(10):
        ns, r, ab, _ = mdp.step([np.random.randint(mdp.info.action_space.n)])
    ns_test = np.array([0.99989477, 0.01450661, 0.97517825, -0.22142128,
                        -0.02323116, 0.40630765])

    assert np.allclose(ns, ns_test)
def experiment(alg, n_epochs, n_steps, n_steps_test, seed):
    np.random.seed(seed)
    torch.manual_seed(seed)

    # MDP
    horizon = 1000
    gamma = 0.99
    mdp = Gym('Pendulum-ID-v1', horizon, gamma)
    mdp.seed(seed)

    rarhmm = torch.load(
        os.path.abspath(os.path.join(__file__, '..', '..')) +
        '/mushroom_rl/sds/envs/hybrid/models/neural_rarhmm_pendulum_cart.pkl',
        map_location='cpu')

    # mdp = Gym('Pendulum-v0', horizon, gamma)

    # Settings
    initial_replay_size = 512
    max_replay_size = 50000 * 4
    batch_size = 512
    n_critic_features = 64
    n_actor_features = 14
    warmup_transitions = 512
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_actor_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_actor_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = (
        actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_critic_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None, rarhmm=rarhmm)

    option_switch_model = OptionSwitchingModel(rarhmm)

    # Algorithm
    core = OptionCore(agent, mdp, option_switch_model=option_switch_model)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    J_results = []
    dataset_results = []

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    gamma = 1  # set gamma to 1 to compute the cumulative (undiscounted) reward
    J = compute_J(dataset, gamma)
    print('J: ', np.mean(J))
    J_results.append({'J_mean': np.mean(J), 'J_std': np.std(J)})
    dataset_results.append(dataset)

    for n in range(n_epochs):
        print('Epoch: ', n)
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma)
        print('J: ', np.mean(J))
        J_results.append({'J_mean': np.mean(J), 'J_std': np.std(J)})
        dataset_results.append(dataset)

    print('Press a button to visualize pendulum')
    # input()
    return core.evaluate(n_episodes=1, render=False), J_results, dataset_results
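# Invocation sketch (assumption only): `HybridSAC` is a hypothetical
# placeholder for an algorithm class that accepts the extra `rarhmm` keyword
# used above; the epoch and step counts are illustrative, not taken from the
# original script.
def run_hybrid_experiment(seed=1):
    # Returns the final evaluation dataset plus per-epoch J statistics.
    return experiment(alg=HybridSAC, n_epochs=10, n_steps=1000,
                      n_steps_test=2000, seed=seed)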
def learn_sac():
    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v0', horizon, gamma)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 10
    tau = 0.005
    lr_alpha = 3e-4

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=False)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=False)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = (
        actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=False)

    # Agent
    agent = SAC(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=2 * initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    return agent
def test_sac():
    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v0', horizon, gamma)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 10
    tau = 0.005
    lr_alpha = 3e-4

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=False)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=False)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = (
        actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=False)

    # Agent
    agent = SAC(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=2 * initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    w = agent.policy.get_weights()
    w_test = np.array([1.6998193, -0.732528, 1.2986078, -0.26860124,
                       0.5094043, -0.5001421, -0.18989229, -0.30646914])

    assert np.allclose(w, w_test)
def experiment(alg, n_epochs, n_steps, n_steps_test, seed):
    torch.manual_seed(seed)
    np.random.seed(seed)

    # MDP
    horizon = 1000
    gamma = 0.99
    mdp = Gym('Pendulum-ID-v1', horizon, gamma)
    mdp.seed(seed)
    # mdp = Gym('Pendulum-v0', horizon, gamma)

    # Settings
    initial_replay_size = 512
    max_replay_size = 50000 * 4
    batch_size = 512
    n_features = 64
    warmup_transitions = 512
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = (
        actor_input_shape[0] + mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    J_results = []
    dataset_results = []

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    gamma = 1  # set gamma to 1 to compute the cumulative (undiscounted) reward
    J = compute_J(dataset, gamma)
    print('J: ', np.mean(J))
    J_results.append({'J_mean': np.mean(J), 'J_std': np.std(J)})
    dataset_results.append(dataset)

    for n in range(n_epochs):
        print('Epoch: ', n)
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma)
        print('J: ', np.mean(J))
        J_results.append({'J_mean': np.mean(J), 'J_std': np.std(J)})
        dataset_results.append(dataset)

    print('Press a button to visualize pendulum')
    # input()
    return core.evaluate(n_episodes=1, render=False), J_results, dataset_results
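# Invocation sketch (assumption): mushroom_rl's SAC matches the constructor
# arguments used in `experiment` above; the epoch and step counts below are
# illustrative only, not taken from the original script.
from mushroom_rl.algorithms.actor_critic import SAC


def run_sac_experiment(seed=1):
    # Returns the final evaluation dataset plus per-epoch J statistics.
    return experiment(alg=SAC, n_epochs=10, n_steps=1000,
                      n_steps_test=2000, seed=seed)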