def run(self, epochs=5000, train=False):
    # Train the model
    if train:
        # 1000 epochs is approximately 50,000 time steps
        self.model.learn(total_timesteps=(50 * epochs))
        self.model.save(self.name)

    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    self.model = HER.load(self.name, env=self.env)

    success_rate = []
    for i in range(100):
        obs = self.env.reset()
        score = 0
        success_rate.append(False)
        for j in range(1000):
            action, _ = self.model.predict(obs)
            obs, reward, done, info = self.env.step(action)
            score += reward
            success_rate[-1] = info["is_success"]
            # self.env.render()
            if done:
                break
        print("epoch: ", j)
        # Guard against division by zero when the episode ends on the first step
        if j != 0:
            print("score:", score, "average score:", score / j)
        print("success rate: ", success_rate.count(True) / len(success_rate))
    self.plot_success(success_rate, 2)
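# The run() method above calls self.plot_success, which is not defined in this snippet.
# Below is a minimal sketch of such a helper, intended to live on the same class; the
# name already matches the call site, but the signature details and the sliding-window
# smoothing are assumptions, and matplotlib is assumed to be available.
import matplotlib.pyplot as plt
import numpy as np

def plot_success(self, success_rate, window):
    # Turn the per-episode boolean flags into a moving-average success rate
    successes = np.array(success_rate, dtype=float)
    if len(successes) >= window:
        smoothed = np.convolve(successes, np.ones(window) / window, mode="valid")
    else:
        smoothed = successes
    plt.plot(smoothed)
    plt.xlabel("episode")
    plt.ylabel("success rate")
    plt.title("HER evaluation success rate")
    plt.show()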
def run(self, train_epochs=5000, train=False):
    # print("np.array(obs).shape: ", obs.shape)
    print("observation_space: ", self.env.observation_space)

    # Train the model
    if train:
        # 1000 epochs is approximately 50,000 time steps
        self.model.learn(total_timesteps=(50 * train_epochs))
        self.model.save("./her_bit_env")

    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    self.model = HER.load('./her_bit_env_new', env=self.env)

    obs = self.env.get_observation_simulated()
    for i in range(1):
        obs = self.env.reset()
        score = 0
        self.env.success_history.append(False)
        start = time.time()
        for j in range(1000):
            # obs needs simulated coords
            action, _ = self.model.predict(obs)
            obs, reward, done, info = self.env.step(action)
            score += reward
            if j != 49:
                self.env.success_history[-1] = done
            # self.env.success_history[-1] = done
            print("Distance history: ", self.env.distance_history[-1])
            print("Success history: ", self.env.success_history[-1])
            if done:
                end = time.time()
                self.env.time_history.append(end - start)
                break
            time.sleep(1)
        print("epoch: ", j)
        if j != 0:
            print("score:", score, "average score:", score / j)
        print("self.env.success_history[-1]: ", self.env.success_history[-1])
        print(
            "success rate: ",
            self.env.success_history.count(True) / len(self.env.success_history))
    return self.env.success_history, self.env.distance_history, self.env.time_history
def evaluate(params):
    # Load saved model (exp_name and env are assumed to be defined at module level)
    model = HER.load(exp_name, env=env)
    results = np.zeros(shape=(0,))

    obs = env.reset()
    # Evaluate the agent
    episode_reward = 0
    for _ in range(params.get("test_episodes")):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done or info.get('is_success', False):
            # Record the episode result before resetting the reward counter
            result = ("Reward:", episode_reward, "Success?", info.get('is_success', False))
            results = np.append(results, result, axis=None)
            episode_reward = 0.0
            obs = env.reset()
    return results
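# A possible way to call evaluate(); "test_episodes" is the only key the function reads,
# and exp_name / env are assumed to be defined at module level. The episode count below
# is an arbitrary example value.
if __name__ == "__main__":
    results = evaluate({"test_episodes": 100})
    print(results)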
def test_save_load(tmp_path, model_class, use_sde, online_sampling):
    """
    Test if 'save' and 'load' saves and loads the model correctly
    """
    if use_sde and model_class != SAC:
        pytest.skip("Only SAC has gSDE support")

    n_bits = 4
    env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN))

    kwargs = dict(use_sde=True) if use_sde else {}

    # create model
    model = HER(
        "MlpPolicy",
        env,
        model_class,
        n_sampled_goal=5,
        goal_selection_strategy="future",
        online_sampling=online_sampling,
        verbose=0,
        tau=0.05,
        batch_size=128,
        learning_rate=0.001,
        policy_kwargs=dict(net_arch=[64]),
        buffer_size=int(1e6),
        gamma=0.98,
        gradient_steps=1,
        train_freq=4,
        learning_starts=100,
        max_episode_length=n_bits,
        **kwargs,
    )

    model.learn(total_timesteps=300)

    env.reset()

    observations_list = []
    for _ in range(10):
        obs = env.step(env.action_space.sample())[0]
        observation = ObsDictWrapper.convert_dict(obs)
        observations_list.append(observation)
    observations = np.array(observations_list)

    # Get dictionary of current parameters
    params = deepcopy(model.policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())

    # Update model parameters with the new random values
    model.policy.load_state_dict(random_params)

    new_params = model.policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # test custom_objects
    # Load with custom objects
    custom_objects = dict(learning_rate=2e-5, dummy=1.0)
    model_ = HER.load(str(tmp_path / "test_save.zip"), env=env, custom_objects=custom_objects, verbose=2)
    assert model_.verbose == 2
    # Check that the custom object was taken into account
    assert model_.learning_rate == custom_objects["learning_rate"]
    # Check that only parameters that are here already are replaced
    assert not hasattr(model_, "dummy")

    model = HER.load(str(tmp_path / "test_save.zip"), env=env)

    # check if params are still the same after load
    new_params = model.policy.state_dict()

    # Check that all params are the same as before the save/load procedure
    for key in params:
        assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load."

    # check if model still selects the same actions
    new_selected_actions, _ = model.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # check if learn still works
    model.learn(total_timesteps=300)

    # Test that the change of parameters works
    model = HER.load(str(tmp_path / "test_save.zip"), env=env, verbose=3, learning_rate=2.0)
    assert model.model.learning_rate == 2.0
    assert model.verbose == 3

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
# NOTE: the start of this snippet was truncated; the head of the HER(...) call below is
# reconstructed as an assumption, based on the SAC model class implied by the saved
# model name ("her_sac_highway").
model = HER(
    "MlpPolicy",
    env,
    SAC,
    n_sampled_goal=4,
    goal_selection_strategy="future",
    # we have to manually specify the max number of steps per episode
    max_episode_length=100,
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
    batch_size=256,
    online_sampling=True,
    policy_kwargs=dict(net_arch=[256, 256, 256]),
)

model.learn(int(2e5))
model.save("her_sac_highway")

# Load saved model
model = HER.load("her_sac_highway", env=env)

obs = env.reset()

# Evaluate the agent
episode_reward = 0
for _ in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get("is_success", False):
        print("Reward:", episode_reward, "Success?", info.get("is_success", False))
        episode_reward = 0.0
        obs = env.reset()
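# The snippet above assumes an already-created goal-based env. For the highway-env
# parking task commonly paired with "her_sac_highway", the setup would look roughly
# like this; the env id and the highway_env import are assumptions based on the saved
# model name, not part of the original script.
import gym
import highway_env  # registers "parking-v0"

env = gym.make("parking-v0")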
import time
from os import system  # needed for system("clear") below

online_sampling = True
# Time limit for the episodes
max_episode_length = 50
action_noise = NormalActionNoise(mean=np.zeros(1), sigma=0.3 * np.ones(1))

# Initialize the model
model = HER(
    'MlpPolicy',
    env,
    model_class,
    n_sampled_goal=4,
    action_noise=action_noise,
    goal_selection_strategy=goal_selection_strategy,
    online_sampling=online_sampling,
    verbose=1,
    max_episode_length=max_episode_length,
    tensorboard_log="./her_overcooked/",
)

model = HER.load('./her_bit_env40.zip', env=env)

obs = env.reset()
for i in range(1000):
    action, _ = model.model.predict(obs, deterministic=True)
    obs, reward, done, _ = env.step(action)
    time.sleep(0.5)
    system("clear")
    if done or i % 20 == 0:
        obs = env.reset()
import gym
import ur5e_env
from stable_baselines3 import HER, DDPG, DQN, SAC, TD3
import time
import os
from stable_baselines3.common.vec_env import DummyVecEnv, VecEnvWrapper, VecVideoRecorder

env = gym.make("ur5e_reacher-v1")
model = HER.load('./logs/her/ur5e_reacher-v1_5/rl_model_2800000_steps', env=env)

# video_length = 2000
# video_folder = "."
# env = DummyVecEnv(env)
# env = VecVideoRecorder(
#     env,
#     video_folder,
#     record_video_trigger=lambda x: x == 0,
#     video_length=video_length,
#     name_prefix="test_video"
# )
# model = HER.load('./logs/Results/rl_model_50000_steps-v16', env=env)

env.render()
for episode in range(10):
    obs = env.reset()
    episodic_reward = 0
    for timestep in range(1000):
        # time.sleep(1/90)
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        episodic_reward += reward
        # if reward > 0:
def Main():
    # define arguments for HER
    env_id = 'ur5e_reacher-v1'
    model_class = DDPG
    goal_selection_strategy = 'future'
    env = gym.make(env_id)

    # define kwargs to be passed to HER and the wrapped algorithm
    kwargs = {
        # "n_timesteps": 10000,
        "policy": 'MlpPolicy',
        "model_class": DDPG,
        "n_sampled_goal": 4,
        "goal_selection_strategy": 'future',
        "buffer_size": 1000000,
        # "ent_coef": 'auto',
        "batch_size": 256,
        "gamma": 0.95,
        "learning_rate": 0.001,
        "learning_starts": 1000,
        "online_sampling": True,
        # "normalize": True
    }
    # In the future, read hyperparams from her.yml
    # kwargs = read_hyperparameters(env_id)

    model = HER(env=env, **kwargs)

    total_n_steps = 1e6
    save_freq = total_n_steps // 10
    max_episode_length = 4000
    n_episodes = total_n_steps // max_episode_length

    model.learn(4000)
    model.save("./her_ur5e_model/model_3")

    model = HER.load('./her_ur5e_model/model_3', env=env)

    all_cumulative_rewards = []
    num_episodes = 5
    num_timesteps = 4800
    env.render()
    # each timestep lasts 1/240 s.
    for episode in range(num_episodes):
        obs = env.reset()
        epi_rewards = []
        for t in range(num_timesteps):
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            # time.sleep(1/240)
            epi_rewards.append(reward)
            if t == num_timesteps - 1:
                done = True
            if done:
                # pp.pprint(info)
                obs = env.reset()
                cumulative_reward = sum(epi_rewards)
                all_cumulative_rewards.append(cumulative_reward)
                print("episode {} | cumulative reward : {}".format(
                    episode, cumulative_reward))

    print("all_cumulative_rewards: ")
    pp.pprint(all_cumulative_rewards)
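# Main() relies on names defined elsewhere in the script. A plausible module header and
# entry point are sketched below as an assumption: the imports mirror the other
# ur5e_reacher snippet, and the pprint alias pp is hypothetical.
import gym
import ur5e_env
import pprint
from stable_baselines3 import HER, DDPG

pp = pprint.PrettyPrinter(indent=4)

if __name__ == "__main__":
    Main()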