def test_full_replay_buffer():
    """
    Test if HER works correctly with a full replay buffer when using online sampling.
    It should not sample the current episode which is not finished.
    """
    n_bits = 4
    env = BitFlippingEnv(n_bits=n_bits, continuous=True)

    # Use a small buffer size to get the buffer full
    model = HER(
        "MlpPolicy",
        env,
        SAC,
        goal_selection_strategy="future",
        online_sampling=True,
        gradient_steps=1,
        train_freq=4,
        max_episode_length=n_bits,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=1,
        buffer_size=20,
        verbose=1,
    )
    model.learn(total_timesteps=100)
def test_performance_her(online_sampling, n_bits):
    """
    Check that DQN+HER can solve BitFlippingEnv.
    It should not work when n_sampled_goal=0 (DQN alone).
    """
    env = BitFlippingEnv(n_bits=n_bits, continuous=False)

    model = HER(
        "MlpPolicy",
        env,
        DQN,
        n_sampled_goal=5,
        goal_selection_strategy="future",
        online_sampling=online_sampling,
        verbose=1,
        learning_rate=5e-4,
        max_episode_length=n_bits,
        train_freq=1,
        learning_starts=100,
        exploration_final_eps=0.02,
        target_update_interval=500,
        seed=0,
        batch_size=32,
    )
    model.learn(total_timesteps=5000, log_interval=50)

    # 90% training success
    assert np.mean(model.ep_success_buffer) > 0.90
def test_get_max_episode_length():
    dict_env = DummyVecEnv([lambda: BitFlippingEnv()])

    # Cannot infer max episode length
    with pytest.raises(ValueError):
        get_time_limit(dict_env, current_max_episode_length=None)

    default_length = 10
    assert get_time_limit(dict_env, current_max_episode_length=default_length) == default_length

    env = gym.make("CartPole-v1")
    vec_env = DummyVecEnv([lambda: env])

    assert get_time_limit(vec_env, current_max_episode_length=None) == 500
    # Overwrite max_episode_steps
    assert get_time_limit(vec_env, current_max_episode_length=default_length) == default_length

    # Set max_episode_steps to None
    env.spec.max_episode_steps = None
    vec_env = DummyVecEnv([lambda: env])
    with pytest.raises(ValueError):
        get_time_limit(vec_env, current_max_episode_length=None)

    # Initialize HER and specify max_episode_length, should not raise an issue
    HER("MlpPolicy", dict_env, DQN, max_episode_length=5)

    with pytest.raises(ValueError):
        HER("MlpPolicy", dict_env, DQN)

    # Wrapped in a TimeLimit, should be fine
    # Note: it requires env.spec to be defined
    env = DummyVecEnv([lambda: gym.wrappers.TimeLimit(BitFlippingEnv(), 10)])
    HER("MlpPolicy", env, DQN)
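The last two lines of this test rely on HER reading the episode length from a gym TimeLimit wrapper instead of an explicit max_episode_length. A minimal sketch of that usage outside the test, assuming the gym/SB3 versions these tests target (the BitFlippingEnv import path may differ in older releases):

import gym
from stable_baselines3 import HER, DQN
from stable_baselines3.common.envs import BitFlippingEnv  # path is version-dependent

# Wrapping the env in TimeLimit records max_episode_steps on its spec,
# so HER / get_time_limit can infer the episode length and
# max_episode_length can be omitted.
env = gym.wrappers.TimeLimit(BitFlippingEnv(n_bits=4), max_episode_steps=4)
model = HER("MlpPolicy", env, DQN, goal_selection_strategy="future", learning_starts=1)
model.learn(total_timesteps=100)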
class Model:
    """
    Helper class for interactions with gym
    """

    def __init__(self, parameters=None):
        # Avoid a mutable default argument
        parameters = parameters if parameters is not None else {}
        self.parameters = parameters
        self.env = TimeLimit(gym.make('PepperPush-v0'), max_episode_steps=100)

        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[256, 256, 256])
        if "net_arch" in parameters:
            policy_kwargs["net_arch"] = parameters["net_arch"]

        self.model = HER(
            parameters.get("policy", 'MlpPolicy'),
            self.env,
            SAC,
            online_sampling=parameters.get("online_sampling", False),
            verbose=parameters.get("verbose", 0),
            max_episode_length=parameters.get("max_episode_length", 100),
            buffer_size=parameters.get("buffer_size", 1000000),
            batch_size=parameters.get("batch_size", 256),
            learning_rate=parameters.get("learning_rate", 0.001),
            learning_starts=parameters.get("learning_starts", 500),
            n_sampled_goal=parameters.get("n_sampled_goal", 4),
            gamma=parameters.get("gamma", 0.95),
            goal_selection_strategy=parameters.get("goal_selection_strategy", 'future'),
            ent_coef=parameters.get("ent_coef", 'auto'),
            policy_kwargs=policy_kwargs,
            train_freq=parameters.get("train_freq", 1),
            tensorboard_log=parameters.get("tensorboard_log", "./data/0_tensorboard/"))

    def learn(self, iterations: int):
        self.model.learn(iterations)

    def evaluate(self):
        test_env = TimeLimit(gym.make('PepperPush-v0'), max_episode_steps=100)
        obs = test_env.reset()
        results = evaluate_policy(self.model, test_env, n_eval_episodes=75, return_episode_rewards=False)
        return results[0]

    def save(self, path="./data/0"):
        self.model.save(path)
def test_eval_success_logging(tmp_path):
    n_bits = 2
    env = BitFlippingEnv(n_bits=n_bits)
    eval_env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=n_bits)])
    eval_callback = EvalCallback(
        ObsDictWrapper(eval_env),
        eval_freq=250,
        log_path=tmp_path,
        warn=False,
    )
    model = HER("MlpPolicy", env, DQN, learning_starts=100, seed=0, max_episode_length=n_bits)
    model.learn(500, callback=eval_callback)

    assert len(eval_callback._is_success_buffer) > 0
    # More than 50% success rate
    assert np.mean(eval_callback._is_success_buffer) > 0.5
def test_goal_selection_strategy(goal_selection_strategy, online_sampling):
    """
    Test different goal strategies.
    """
    env = BitFlippingEnv(continuous=True)

    model = HER(
        "MlpPolicy",
        env,
        SAC,
        goal_selection_strategy=goal_selection_strategy,
        online_sampling=online_sampling,
        gradient_steps=1,
        train_freq=1,
        n_episodes_rollout=-1,
        max_episode_length=10,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=100,
    )
    model.learn(total_timesteps=300)
def test_her(model_class, online_sampling):
    """
    Test Hindsight Experience Replay.
    """
    n_bits = 4
    env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN))

    model = HER(
        "MlpPolicy",
        env,
        model_class,
        goal_selection_strategy="future",
        online_sampling=online_sampling,
        gradient_steps=1,
        train_freq=4,
        max_episode_length=n_bits,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=100,
    )
    model.learn(total_timesteps=300)
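Tests like this one receive model_class and online_sampling from pytest parametrization. A hedged sketch of the decorators such a test would carry; the exact value sets are assumptions, not taken from this file:

import pytest
from stable_baselines3 import DDPG, DQN, SAC, TD3

# Each combination of wrapped algorithm and sampling mode becomes one test case.
@pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN])
@pytest.mark.parametrize("online_sampling", [True, False])
def test_her(model_class, online_sampling):
    ...  # body as above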
def train(params):
    # DDPG hyperparams:
    # Create the action noise object that will be used for exploration
    n_actions = env.action_space.shape[0]
    noise_std = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))

    model = HER(params.get("policy"),
                env,
                DDPG,
                n_sampled_goal=4,
                goal_selection_strategy=params.get("strategy"),
                online_sampling=True,
                verbose=1,
                buffer_size=params.get("buffer_size"),
                learning_rate=params.get("learning_rate"),
                action_noise=action_noise,
                tensorboard_log=log_dir,
                gamma=params.get("gamma"),
                batch_size=params.get("batch_size"),
                policy_kwargs=dict(net_arch=[256, 256, 256]),
                max_episode_length=100)

    # Train for the requested number of steps (e.g. 1e5)
    model.learn(params.get("train_steps"))

    # Save the trained agent
    model.save(exp_name)
def Main():
    pp = pprint.PrettyPrinter(indent=4)

    # Make environment and wrap it with a Monitor
    env = gym.make('ur5e_reacher-v1')
    env = Monitor(env, filename="logs", allow_early_resets=True)

    # *** define model ***
    # Hyperparams
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions))
    # action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    model_class = DDPG

    # kwargs are the parameters for the DDPG model init
    # (action_noise must be a noise instance, not the NormalActionNoise class)
    kwargs = {"device": "cuda", "action_noise": action_noise}

    model = HER(
        'MlpPolicy',
        env,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy='future',
        verbose=1,
        learning_rate=0.005,
        online_sampling=True,
        # max_episode_length=4800
        **kwargs)

    # Train model
    train = False
    if train:
        model.learn(2 * 10e5)
        model.save("./her_ur5e_model/model_")

    # Load model, not really necessary
    evaluate = True
def test_goal_selection_strategy(goal_selection_strategy, online_sampling):
    """
    Test different goal strategies.
    """
    env = BitFlippingEnv(continuous=True)

    normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))

    model = HER(
        "MlpPolicy",
        env,
        SAC,
        goal_selection_strategy=goal_selection_strategy,
        online_sampling=online_sampling,
        gradient_steps=1,
        train_freq=4,
        max_episode_length=10,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=100,
        action_noise=normal_action_noise,
    )
    assert model.action_noise is not None
    model.learn(total_timesteps=300)
def evaluate(params):
    # Load saved model
    model = HER.load(exp_name, env=env)
    results = np.zeros(shape=(0, 0))
    obs = env.reset()

    # Evaluate the agent
    episode_reward = 0
    for _ in range(params.get("test_episodes")):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done or info.get('is_success', False):
            # Record the episode result before resetting the reward counter
            result = ("Reward:", episode_reward, "Success?", info.get('is_success', False))
            results = np.append(results, result, axis=None)
            episode_reward = 0.0
            obs = env.reset()
def train(params):
    # SAC hyperparams:
    model = HER(params.get("policy"),
                env,
                SAC,
                n_sampled_goal=4,
                goal_selection_strategy=params.get("strategy"),
                online_sampling=True,
                verbose=1,
                buffer_size=params.get("buffer_size"),
                learning_rate=params.get("learning_rate"),
                tensorboard_log=log_dir,
                gamma=params.get("gamma"),
                batch_size=params.get("batch_size"),
                policy_kwargs=dict(net_arch=[256, 256, 256]),
                max_episode_length=100)

    # Train for 1e5 steps
    model.learn(params.get("train_steps"))

    # Save the trained agent
    model.save(exp_name)
def Main():
    pp = pprint.PrettyPrinter(indent=4)

    # Define arguments for HER
    env_id = 'ur5e_reacher-v1'
    model_class = DDPG
    goal_selection_strategy = 'future'
    env = gym.make(env_id)

    # Define kwargs to be passed to HER and the wrapped algo
    kwargs = {
        # "n_timesteps": 10000,
        "policy": 'MlpPolicy',
        "model_class": model_class,
        "n_sampled_goal": 4,
        "goal_selection_strategy": goal_selection_strategy,
        "buffer_size": 1000000,
        # "ent_coef": 'auto',
        "batch_size": 256,
        "gamma": 0.95,
        "learning_rate": 0.001,
        "learning_starts": 1000,
        "online_sampling": True,
        # "normalize": True
    }
    # In the future, read hyperparams from her.yml
    # kwargs = read_hyperparameters(env_id)

    model = HER(env=env, **kwargs)

    total_n_steps = 1e6
    save_freq = total_n_steps // 10
    max_episode_length = 4000
    n_episodes = total_n_steps // max_episode_length

    model.learn(4000)
    model.save("./her_ur5e_model/model_3")

    model = HER.load('./her_ur5e_model/model_3', env=env)

    all_cumulative_rewards = []
    num_episodes = 5
    num_timesteps = 4800
    env.render()

    # Each timestep lasts 1/240 s.
    for episode in range(num_episodes):
        obs = env.reset()
        epi_rewards = []
        for t in range(num_timesteps):
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            # time.sleep(1/240)
            epi_rewards.append(reward)
            if t == num_timesteps - 1:
                done = True
            if done:
                # pp.pprint(info)
                obs = env.reset()
                cumulative_reward = sum(epi_rewards)
                all_cumulative_rewards.append(cumulative_reward)
                print("episode {} | cumulative reward : {}".format(episode, cumulative_reward))

    print("all_cumulative_rewards: ")
    pp.pprint(all_cumulative_rewards)
import gym
import torch as th
from stable_baselines3 import HER, SAC
from stable_baselines3.sac import MlpPolicy

env = gym.make('FetchReach-v1')

policy_kwargs = dict(
    activation_fn=th.nn.ReLU,
    net_arch=[64, 64],
)

model = HER(MlpPolicy,
            env,
            SAC,
            online_sampling=False,
            verbose=1,
            max_episode_length=100,
            buffer_size=1000000,
            batch_size=256,
            learning_rate=0.001,
            learning_starts=1000,
            gamma=0.95,
            ent_coef='auto',
            n_sampled_goal=4,
            goal_selection_strategy='future',
            policy_kwargs=policy_kwargs)

model.learn(total_timesteps=30000)
model.save("data/fetch_reach_sb")

obs = env.reset()
for _ in range(100):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
def test_save_load(tmp_path, model_class, use_sde, online_sampling):
    """
    Test if 'save' and 'load' saves and loads model correctly
    """
    if use_sde and model_class != SAC:
        pytest.skip("Only SAC has gSDE support")

    n_bits = 4
    env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN))

    kwargs = dict(use_sde=True) if use_sde else {}

    # create model
    model = HER(
        "MlpPolicy",
        env,
        model_class,
        n_sampled_goal=5,
        goal_selection_strategy="future",
        online_sampling=online_sampling,
        verbose=0,
        tau=0.05,
        batch_size=128,
        learning_rate=0.001,
        policy_kwargs=dict(net_arch=[64]),
        buffer_size=int(1e6),
        gamma=0.98,
        gradient_steps=1,
        train_freq=4,
        learning_starts=100,
        max_episode_length=n_bits,
        **kwargs
    )

    model.learn(total_timesteps=300)

    env.reset()

    observations_list = []
    for _ in range(10):
        obs = env.step(env.action_space.sample())[0]
        observation = ObsDictWrapper.convert_dict(obs)
        observations_list.append(observation)
    observations = np.array(observations_list)

    # Get dictionary of current parameters
    params = deepcopy(model.policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())

    # Update model parameters with the new random values
    model.policy.load_state_dict(random_params)

    new_params = model.policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # test custom_objects
    # Load with custom objects
    custom_objects = dict(learning_rate=2e-5, dummy=1.0)
    model_ = HER.load(str(tmp_path / "test_save.zip"), env=env, custom_objects=custom_objects, verbose=2)
    assert model_.verbose == 2
    # Check that the custom object was taken into account
    assert model_.learning_rate == custom_objects["learning_rate"]
    # Check that only parameters that are here already are replaced
    assert not hasattr(model_, "dummy")

    model = HER.load(str(tmp_path / "test_save.zip"), env=env)

    # check if params are still the same after load
    new_params = model.policy.state_dict()

    # Check that all params are the same as before the save/load procedure
    for key in params:
        assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load."

    # check if model still selects the same actions
    new_selected_actions, _ = model.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # check if learn still works
    model.learn(total_timesteps=300)

    # Test that the change of parameters works
    model = HER.load(str(tmp_path / "test_save.zip"), env=env, verbose=3, learning_rate=2.0)
    assert model.model.learning_rate == 2.0
    assert model.verbose == 3

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
import gym
import ur5e_env
from stable_baselines3 import HER, DDPG, DQN, SAC, TD3
import time
import os
from stable_baselines3.common.vec_env import DummyVecEnv, VecEnvWrapper, VecVideoRecorder

env = gym.make("ur5e_reacher-v1")
model = HER.load('./logs/her/ur5e_reacher-v1_5/rl_model_2800000_steps', env=env)

# video_length = 2000
# video_folder = "."
# env = DummyVecEnv(env)
# env = VecVideoRecorder(
#     env,
#     video_folder,
#     record_video_trigger=lambda x: x == 0,
#     video_length=video_length,
#     name_prefix="test_video"
# )

# model = HER.load('./logs/Results/rl_model_50000_steps-v16', env=env)

env.render()

for episode in range(10):
    obs = env.reset()
    episodic_reward = 0
    for timestep in range(1000):
        # time.sleep(1/90)
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        episodic_reward += reward
        # if reward > 0:
def test_import_error():
    with pytest.raises(ImportError) as excinfo:
        from stable_baselines3 import HER

        HER("MlpPolicy")
    assert "documentation" in str(excinfo.value)
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# If True the HER transitions will get sampled online
online_sampling = True
# Time limit for the episodes
max_episode_length = 50

action_noise = NormalActionNoise(mean=np.zeros(1), sigma=0.3 * np.ones(1))

# Initialize the model
model = HER('MlpPolicy',
            env,
            model_class,
            n_sampled_goal=4,
            action_noise=action_noise,
            goal_selection_strategy=goal_selection_strategy,
            online_sampling=online_sampling,
            verbose=1,
            max_episode_length=max_episode_length,
            tensorboard_log="./her_overcooked/")

model = HER.load('./her_bit_env40.zip', env=env)

obs = env.reset()
for i in range(1000):
    action, _ = model.model.predict(obs, deterministic=True)
    obs, reward, done, _ = env.step(action)
    import time
    time.sleep(0.5)
    system("clear")
class DDPG_HER:
    def __init__(self, env, model_class=DDPG, name="./her_bit_env"):
        self.model_class = model_class  # works also with SAC, DDPG and TD3
        # Available strategies (cf paper): future, final, episode, random
        self.env = env
        self.name = name
        self.goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

        self.model = HER('MlpPolicy',
                         self.env,
                         self.model_class,
                         n_sampled_goal=4,
                         goal_selection_strategy=self.goal_selection_strategy,
                         buffer_size=1000000,
                         batch_size=256,
                         gamma=.95,
                         learning_rate=1e-3,
                         verbose=1,
                         max_episode_length=50)

    def run(self, epochs=5000, train=False):
        # Train the model
        if train:
            # 1000 epochs is approximately 50,000 time steps
            self.model.learn(total_timesteps=(50 * epochs))
            self.model.save(self.name)

        # WARNING: you must pass an env
        # or wrap your environment with HERGoalEnvWrapper to use the predict method
        self.model = HER.load(self.name, env=self.env)

        success_rate = []
        for i in range(100):
            obs = self.env.reset()
            score = 0
            success_rate.append(False)
            for j in range(1000):
                action, _ = self.model.predict(obs)
                obs, reward, done, info = self.env.step(action)
                score += reward
                success_rate[-1] = info["is_success"]
                # self.env.render()
                if done:
                    break
            print("epoch: ", j)
            print("score:", score, "average score:", score / j)
            print("success rate: ", success_rate.count(True) / len(success_rate))

        self.plot_success(success_rate, 2)

    def plot_success(self, success_rate, plot_num):
        average = []
        for i, point in enumerate(success_rate):
            average.append(success_rate[:i + 1].count(True) / (i + 1))

        plt.plot(success_rate, color='blue', label="Epoch Success Rate")
        plt.plot(average, color='red', label="Average Success Rate", zorder=3)
        plt.legend()
        plt.title("Success Rate for Simulated FetchReach")
        plt.ylabel("Success Rate")
        plt.xlabel("Number iterations")
        plt.savefig("./plots/success/success_rate_{}.png".format(plot_num))
        plt.clf()
def test_save_load_replay_buffer(tmp_path, recwarn, online_sampling, truncate_last_trajectory):
    """
    Test if 'save_replay_buffer' and 'load_replay_buffer' works correctly
    """
    # remove gym warnings
    warnings.filterwarnings(action="ignore", category=DeprecationWarning)
    warnings.filterwarnings(action="ignore", category=UserWarning, module="gym")

    path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl")
    path.parent.mkdir(exist_ok=True, parents=True)  # to not raise a warning
    env = BitFlippingEnv(n_bits=4, continuous=True)
    model = HER(
        "MlpPolicy",
        env,
        SAC,
        goal_selection_strategy="future",
        online_sampling=online_sampling,
        gradient_steps=1,
        train_freq=4,
        max_episode_length=4,
        buffer_size=int(2e4),
        policy_kwargs=dict(net_arch=[64]),
        seed=0,
    )
    model.learn(200)
    old_replay_buffer = deepcopy(model.replay_buffer)
    model.save_replay_buffer(path)
    del model.model.replay_buffer

    with pytest.raises(AttributeError):
        model.replay_buffer

    # Check that there is no warning
    assert len(recwarn) == 0

    model.load_replay_buffer(path, truncate_last_trajectory)

    if truncate_last_trajectory:
        assert len(recwarn) == 1
        warning = recwarn.pop(UserWarning)
        assert "The last trajectory in the replay buffer will be truncated" in str(warning.message)
    else:
        assert len(recwarn) == 0

    if online_sampling:
        n_episodes_stored = model.replay_buffer.n_episodes_stored
        assert np.allclose(
            old_replay_buffer.buffer["observation"][:n_episodes_stored],
            model.replay_buffer.buffer["observation"][:n_episodes_stored],
        )
        assert np.allclose(
            old_replay_buffer.buffer["next_obs"][:n_episodes_stored],
            model.replay_buffer.buffer["next_obs"][:n_episodes_stored],
        )
        assert np.allclose(
            old_replay_buffer.buffer["action"][:n_episodes_stored],
            model.replay_buffer.buffer["action"][:n_episodes_stored],
        )
        assert np.allclose(
            old_replay_buffer.buffer["reward"][:n_episodes_stored],
            model.replay_buffer.buffer["reward"][:n_episodes_stored],
        )
        # we might change the last done of the last trajectory so we don't compare it
        assert np.allclose(
            old_replay_buffer.buffer["done"][:n_episodes_stored - 1],
            model.replay_buffer.buffer["done"][:n_episodes_stored - 1],
        )
    else:
        assert np.allclose(old_replay_buffer.observations, model.replay_buffer.observations)
        assert np.allclose(old_replay_buffer.actions, model.replay_buffer.actions)
        assert np.allclose(old_replay_buffer.rewards, model.replay_buffer.rewards)
        assert np.allclose(old_replay_buffer.dones, model.replay_buffer.dones)

    # test if continuing training works properly
    reset_num_timesteps = False if truncate_last_trajectory is False else True
    model.learn(200, reset_num_timesteps=reset_num_timesteps)
# Time limit for the episodes
max_episode_length = 50

action_noise = NormalActionNoise(mean=np.zeros(1), sigma=0.3 * np.ones(1))

# Initialize the model
model = HER(
    "MlpPolicy",
    env,
    model_class,
    n_sampled_goal=4,
    goal_selection_strategy=goal_selection_strategy,
    # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper
    # we have to manually specify the max number of steps per episode
    max_episode_length=max_episode_length,
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
    tensorboard_log="./her_overcooked",
    batch_size=256,
    online_sampling=online_sampling,
    action_noise=action_noise,
    # policy_kwargs=dict(net_arch=[256, 256, 256]),
)

# model = HER.load('./her_bit_env250.zip', env=env)

# Train the model
for i in range(1000):
    model.learn(10000)
    model.save(f"./her_bit_env{i}")
env = gym.make("parking-v0") # Create 4 artificial transitions per real transition n_sampled_goal = 4 # SAC hyperparams: model = HER( "MlpPolicy", env, SAC, n_sampled_goal=n_sampled_goal, goal_selection_strategy="future", # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper # we have to manually specify the max number of steps per episode max_episode_length=100, verbose=1, buffer_size=int(1e6), learning_rate=1e-3, gamma=0.95, batch_size=256, online_sampling=True, policy_kwargs=dict(net_arch=[256, 256, 256]), ) model.learn(int(2e5)) model.save("her_sac_highway") # Load saved model model = HER.load("her_sac_highway", env=env)
class DDPG_HER:
    def __init__(self, env, model_class=DDPG):
        self.model_class = model_class  # works also with SAC, DDPG and TD3
        # Available strategies (cf paper): future, final, episode, random
        self.env = env
        self.goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

        self.model = HER('MlpPolicy',
                         self.env,
                         self.model_class,
                         n_sampled_goal=4,
                         goal_selection_strategy=self.goal_selection_strategy,
                         buffer_size=1000000,
                         batch_size=256,
                         gamma=.95,
                         learning_rate=1e-3,
                         verbose=1,
                         max_episode_length=50)

    def run(self, train_epochs=5000, train=False):
        # print("np.array(obs).shape: ", obs.shape)
        print("observation_space: ", self.env.observation_space)

        # Train the model
        if train:
            # 1000 epochs is approximately 50,000 time steps
            self.model.learn(total_timesteps=(50 * train_epochs))
            self.model.save("./her_bit_env")

        # WARNING: you must pass an env
        # or wrap your environment with HERGoalEnvWrapper to use the predict method
        self.model = HER.load('./her_bit_env_new', env=self.env)

        obs = self.env.get_observation_simulated()
        for i in range(1):
            obs = self.env.reset()
            score = 0
            self.env.success_history.append(False)
            start = time.time()
            for j in range(1000):
                # obs needs simulated coords
                action, _ = self.model.predict(obs)
                obs, reward, done, info = self.env.step(action)
                score += reward
                if j != 49:
                    self.env.success_history[-1] = done
                # self.env.success_history[-1] = done
                print("Distance history: ", self.env.distance_history[-1])
                print("Success history: ", self.env.success_history[-1])
                if done:
                    end = time.time()
                    self.env.time_history.append(end - start)
                    break
                time.sleep(1)
            print("epoch: ", j)
            if j != 0:
                print("score:", score, "average score:", score / j)
            print("self.env.success_history[-1]: ", self.env.success_history[-1])
            print("success rate: ",
                  self.env.success_history.count(True) / len(self.env.success_history))

        return self.env.success_history, self.env.distance_history, self.env.time_history
eval_env = gym.make(env_id)

n_actions = eval_env.action_space.shape[0]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.2 * np.ones(n_actions))

model = HER('MlpPolicy',
            vec_env,
            MODEL_CLASS,
            n_sampled_goal=4,
            goal_selection_strategy='future',
            online_sampling=True,
            verbose=1,
            action_noise=action_noise,
            gamma=.978,
            tau=.95,
            buffer_size=int(1e7),
            batch_size=512,
            learning_starts=10000,
            train_freq=1000,
            gradient_steps=1000,
            policy_kwargs=dict(net_arch=[350, 400, 350],
                               optimizer_kwargs=dict(weight_decay=1.)))

eval_rews = []
eval_accs = []
for i in range(0, NUM_EPOCHS):
    model = model.learn(total_timesteps=NUM_TIMESTEPS, reset_num_timesteps=False)
    eval_rew, eval_acc = evaluate_agent(model)