def train(params):
    # DDPG hyperparams:
    # Create the action noise object that will be used for exploration
    n_actions = env.action_space.shape[0]
    noise_std = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=noise_std * np.ones(n_actions))

    model = HER(params.get("policy"),
                env,
                DDPG,
                n_sampled_goal=4,
                goal_selection_strategy=params.get("strategy"),
                online_sampling=True,
                verbose=1,
                buffer_size=params.get("buffer_size"),
                learning_rate=params.get("learning_rate"),
                action_noise=action_noise,
                tensorboard_log=log_dir,
                gamma=params.get("gamma"),
                batch_size=params.get("batch_size"),
                policy_kwargs=dict(net_arch=[256, 256, 256]),
                max_episode_length=100)

    # Train for the configured number of steps
    model.learn(params.get("train_steps"))

    # Save the trained agent
    model.save(exp_name)
def Main():
    pp = pprint.PrettyPrinter(indent=4)

    # Make environment and wrap it
    env = gym.make('ur5e_reacher-v1')
    env = Monitor(env, filename="logs", allow_early_resets=True)

    # *** define model ***
    # Hyperparams
    # n_actions = env.action_space.shape[-1]
    # action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # The action noise must be an instance, not the NormalActionNoise class itself;
    # the 0.1 std below is an assumed value.
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model_class = DDPG
    # kwargs are the parameters for the DDPG model init
    kwargs = {"device": "cuda", "action_noise": action_noise}
    model = HER(
        'MlpPolicy',
        env,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy='future',
        verbose=1,
        learning_rate=0.005,
        online_sampling=True,
        # max_episode_length=4800
        **kwargs)

    # Train model
    train = False
    if train:
        model.learn(2 * 10e5)
        model.save("./her_ur5e_model/model_")

    # Load model, not really necessary
    evaluate = True
def train(params):
    # SAC hyperparams:
    model = HER(params.get("policy"),
                env,
                SAC,
                n_sampled_goal=4,
                goal_selection_strategy=params.get("strategy"),
                online_sampling=True,
                verbose=1,
                buffer_size=params.get("buffer_size"),
                learning_rate=params.get("learning_rate"),
                tensorboard_log=log_dir,
                gamma=params.get("gamma"),
                batch_size=params.get("batch_size"),
                policy_kwargs=dict(net_arch=[256, 256, 256]),
                max_episode_length=100)

    # Train for the configured number of steps
    model.learn(params.get("train_steps"))

    # Save the trained agent
    model.save(exp_name)
class Model:
    """
    Helper class for interactions with gym
    """

    def __init__(self, parameters=None):
        # Avoid a mutable default argument
        parameters = {} if parameters is None else parameters
        self.parameters = parameters
        self.env = TimeLimit(gym.make('PepperPush-v0'), max_episode_steps=100)

        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[256, 256, 256])
        if "net_arch" in parameters:
            policy_kwargs["net_arch"] = parameters["net_arch"]

        self.model = HER(
            parameters.get("policy", 'MlpPolicy'),
            self.env,
            SAC,
            online_sampling=parameters.get("online_sampling", False),
            verbose=parameters.get("verbose", 0),
            max_episode_length=parameters.get("max_episode_length", 100),
            buffer_size=parameters.get("buffer_size", 1000000),
            batch_size=parameters.get("batch_size", 256),
            learning_rate=parameters.get("learning_rate", 0.001),
            learning_starts=parameters.get("learning_starts", 500),
            n_sampled_goal=parameters.get("n_sampled_goal", 4),
            gamma=parameters.get("gamma", 0.95),
            goal_selection_strategy=parameters.get("goal_selection_strategy", 'future'),
            ent_coef=parameters.get("ent_coef", 'auto'),
            policy_kwargs=policy_kwargs,
            train_freq=parameters.get("train_freq", 1),
            tensorboard_log=parameters.get("tensorboard_log", "./data/0_tensorboard/"))

    def learn(self, iterations: int):
        self.model.learn(iterations)

    def evaluate(self):
        test_env = TimeLimit(gym.make('PepperPush-v0'), max_episode_steps=100)
        results = evaluate_policy(self.model,
                                  test_env,
                                  n_eval_episodes=75,
                                  return_episode_rewards=False)
        # evaluate_policy returns (mean_reward, std_reward)
        return results[0]

    def save(self, path="./data/0"):
        self.model.save(path)
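A minimal usage sketch for the helper class above; the hyperparameter values, timestep count, and save path are illustrative assumptions, not taken from the original project.

# Hypothetical driver for the Model helper class (values are illustrative).
agent = Model({"verbose": 1, "batch_size": 256, "learning_rate": 0.001})
agent.learn(50000)                 # train for 50k timesteps (arbitrary budget)
mean_reward = agent.evaluate()     # mean reward over 75 evaluation episodes
print("mean evaluation reward:", mean_reward)
agent.save("./data/0")             # save path is an assumption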
policy_kwargs = dict(
    activation_fn=th.nn.ReLU,
    net_arch=[64, 64],
)

model = HER(MlpPolicy,
            env,
            SAC,
            online_sampling=False,
            verbose=1,
            max_episode_length=100,
            buffer_size=1000000,
            batch_size=256,
            learning_rate=0.001,
            learning_starts=1000,
            gamma=0.95,
            ent_coef='auto',
            n_sampled_goal=4,
            goal_selection_strategy='future',
            policy_kwargs=policy_kwargs)

model.learn(total_timesteps=30000)
model.save("data/fetch_reach_sb")

obs = env.reset()
for _ in range(100):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()
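As a quantitative complement to the rendered rollout above, a hedged sketch using Stable-Baselines3's evaluate_policy helper; the import path matches SB3 1.x and the episode count is an arbitrary choice.

from stable_baselines3.common.evaluation import evaluate_policy

# Mean/std of episodic reward over 20 evaluation episodes (episode count is arbitrary).
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")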
def test_save_load(tmp_path, model_class, use_sde, online_sampling):
    """
    Test if 'save' and 'load' saves and loads model correctly
    """
    if use_sde and model_class != SAC:
        pytest.skip("Only SAC has gSDE support")

    n_bits = 4
    env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN))

    kwargs = dict(use_sde=True) if use_sde else {}

    # create model
    model = HER("MlpPolicy",
                env,
                model_class,
                n_sampled_goal=5,
                goal_selection_strategy="future",
                online_sampling=online_sampling,
                verbose=0,
                tau=0.05,
                batch_size=128,
                learning_rate=0.001,
                policy_kwargs=dict(net_arch=[64]),
                buffer_size=int(1e6),
                gamma=0.98,
                gradient_steps=1,
                train_freq=4,
                learning_starts=100,
                max_episode_length=n_bits,
                **kwargs)

    model.learn(total_timesteps=300)

    env.reset()

    observations_list = []
    for _ in range(10):
        obs = env.step(env.action_space.sample())[0]
        observation = ObsDictWrapper.convert_dict(obs)
        observations_list.append(observation)
    observations = np.array(observations_list)

    # Get dictionary of current parameters
    params = deepcopy(model.policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())

    # Update model parameters with the new random values
    model.policy.load_state_dict(random_params)

    new_params = model.policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # test custom_objects
    # Load with custom objects
    custom_objects = dict(learning_rate=2e-5, dummy=1.0)
    model_ = HER.load(str(tmp_path / "test_save.zip"), env=env, custom_objects=custom_objects, verbose=2)
    assert model_.verbose == 2
    # Check that the custom object was taken into account
    assert model_.learning_rate == custom_objects["learning_rate"]
    # Check that only parameters that are here already are replaced
    assert not hasattr(model_, "dummy")

    model = HER.load(str(tmp_path / "test_save.zip"), env=env)

    # check if params are still the same after load
    new_params = model.policy.state_dict()

    # Check that all params are the same as before save load procedure now
    for key in params:
        assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load."

    # check if model still selects the same actions
    new_selected_actions, _ = model.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # check if learn still works
    model.learn(total_timesteps=300)

    # Test that the change of parameters works
    model = HER.load(str(tmp_path / "test_save.zip"), env=env, verbose=3, learning_rate=2.0)
    assert model.model.learning_rate == 2.0
    assert model.verbose == 3

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
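The test above expects pytest to supply model_class, use_sde, and online_sampling as parametrized arguments. A hedged sketch of decorators that would drive it; the value lists are assumptions, not taken from the original test module.

import pytest
from stable_baselines3 import DDPG, DQN, SAC, TD3

# Hypothetical parametrization covering the combinations the test skips/handles.
@pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN])
@pytest.mark.parametrize("use_sde", [False, True])
@pytest.mark.parametrize("online_sampling", [False, True])
def test_save_load(tmp_path, model_class, use_sde, online_sampling):
    ...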
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
    tensorboard_log="./her_overcooked",
    batch_size=256,
    online_sampling=online_sampling,
    action_noise=action_noise,
    # policy_kwargs=dict(net_arch=[256, 256, 256]),
)

# model = HER.load('./her_bit_env250.zip', env=env)

# Train the model
for i in range(1000):
    model.learn(10000)
    model.save(f"./her_bit_env{i}")

# model = HER.load('./her_bit_env', env=env)

obs = env.reset()
episode_reward = 0.0  # initialize before the rollout loop
for _ in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get("is_success", False):
        print("Reward:", episode_reward, "Success?", info.get("is_success", False))
        episode_reward = 0.0
        obs = env.reset()
    n_sampled_goal=n_sampled_goal,
    goal_selection_strategy="future",
    # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper
    # we have to manually specify the max number of steps per episode
    max_episode_length=100,
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
    batch_size=256,
    online_sampling=True,
    policy_kwargs=dict(net_arch=[256, 256, 256]),
)

model.learn(int(2e5))
model.save("her_sac_highway")

# Load saved model
model = HER.load("her_sac_highway", env=env)

obs = env.reset()

# Evaluate the agent
episode_reward = 0
for _ in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get("is_success", False):
        print("Reward:", episode_reward, "Success?",
              info.get("is_success", False))
class DDPG_HER:
    def __init__(self, env, model_class=DDPG, name="./her_bit_env"):
        self.model_class = model_class  # works also with SAC, DDPG and TD3
        # Available strategies (cf paper): future, final, episode, random
        self.env = env
        self.name = name
        self.goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
        self.model = HER('MlpPolicy',
                         self.env,
                         self.model_class,
                         n_sampled_goal=4,
                         goal_selection_strategy=self.goal_selection_strategy,
                         buffer_size=1000000,
                         batch_size=256,
                         gamma=.95,
                         learning_rate=1e-3,
                         verbose=1,
                         max_episode_length=50)

    def run(self, epochs=5000, train=False):
        # Train the model
        if train:
            # 1000 epochs is approximately 50,000 time steps
            self.model.learn(total_timesteps=(50 * epochs))
            self.model.save(self.name)

        # WARNING: you must pass an env
        # or wrap your environment with HERGoalEnvWrapper to use the predict method
        self.model = HER.load(self.name, env=self.env)

        success_rate = []
        for i in range(100):
            obs = self.env.reset()
            score = 0
            success_rate.append(False)
            for j in range(1000):
                action, _ = self.model.predict(obs)
                obs, reward, done, info = self.env.step(action)
                score += reward
                success_rate[-1] = info["is_success"]
                # self.env.render()
                if done:
                    break
            print("epoch: ", j)
            print("score:", score, "average score:", score / j)
            print("success rate: ", success_rate.count(True) / len(success_rate))
        self.plot_success(success_rate, 2)

    def plot_success(self, success_rate, plot_num):
        average = []
        for i, point in enumerate(success_rate):
            average.append(success_rate[:i + 1].count(True) / (i + 1))
        plt.plot(success_rate, color='blue', label="Epoch Success Rate")
        plt.plot(average, color='red', label="Average Success Rate", zorder=3)
        plt.legend()
        plt.title("Success Rate for Simulated FetchReach")
        plt.ylabel("Success Rate")
        plt.xlabel("Number of iterations")
        plt.savefig("./plots/success/success_rate_{}.png".format(plot_num))
        plt.clf()
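A minimal driver sketch for the class above; it assumes a goal-based gym environment such as FetchReach-v1 is installed and exposes "is_success" in the step info dict, as run() expects. The env id and epoch count are assumptions.

# Hypothetical usage of DDPG_HER (env id and epoch count are illustrative).
env = gym.make('FetchReach-v1')
agent = DDPG_HER(env, model_class=DDPG, name="./her_bit_env")
agent.run(epochs=1000, train=True)  # ~50,000 training timesteps, then 100 evaluation episodes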
def Main():
    pp = pprint.PrettyPrinter(indent=4)

    # Define arguments for HER
    env_id = 'ur5e_reacher-v1'
    model_class = DDPG
    goal_selection_strategy = 'future'
    env = gym.make(env_id)

    # Define kwargs to be passed to HER and the wrapped algo
    kwargs = {
        # "n_timesteps": 10000,
        "policy": 'MlpPolicy',
        "model_class": model_class,
        "n_sampled_goal": 4,
        "goal_selection_strategy": goal_selection_strategy,
        "buffer_size": 1000000,
        # "ent_coef": 'auto',
        "batch_size": 256,
        "gamma": 0.95,
        "learning_rate": 0.001,
        "learning_starts": 1000,
        "online_sampling": True,
        # "normalize": True
    }
    # In the future, read hyperparams from her.yml
    # kwargs = read_hyperparameters(env_id)

    model = HER(env=env, **kwargs)

    total_n_steps = 1e6
    save_freq = total_n_steps // 10
    max_episode_length = 4000
    n_episodes = total_n_steps // max_episode_length

    model.learn(4000)
    model.save("./her_ur5e_model/model_3")
    model = HER.load('./her_ur5e_model/model_3', env=env)

    all_cumulative_rewards = []
    num_episodes = 5
    num_timesteps = 4800
    env.render()

    # Each timestep lasts 1/240 s.
    for episode in range(num_episodes):
        obs = env.reset()
        epi_rewards = []
        for t in range(num_timesteps):
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            # time.sleep(1/240)
            epi_rewards.append(reward)
            if t == num_timesteps - 1:
                done = True
            if done:
                # pp.pprint(info)
                obs = env.reset()
        cumulative_reward = sum(epi_rewards)
        all_cumulative_rewards.append(cumulative_reward)
        print("episode {} | cumulative reward : {}".format(
            episode, cumulative_reward))

    print("all_cumulative_rewards: ")
    pp.pprint(all_cumulative_rewards)
class DDPG_HER:
    def __init__(self, env, model_class=DDPG):
        self.model_class = model_class  # works also with SAC, DDPG and TD3
        # Available strategies (cf paper): future, final, episode, random
        self.env = env
        self.goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
        self.model = HER('MlpPolicy',
                         self.env,
                         self.model_class,
                         n_sampled_goal=4,
                         goal_selection_strategy=self.goal_selection_strategy,
                         buffer_size=1000000,
                         batch_size=256,
                         gamma=.95,
                         learning_rate=1e-3,
                         verbose=1,
                         max_episode_length=50)

    def run(self, train_epochs=5000, train=False):
        # print("np.array(obs).shape: ", obs.shape)
        print("observation_space: ", self.env.observation_space)

        # Train the model
        if train:
            # 1000 epochs is approximately 50,000 time steps
            self.model.learn(total_timesteps=(50 * train_epochs))
            self.model.save("./her_bit_env")

        # WARNING: you must pass an env
        # or wrap your environment with HERGoalEnvWrapper to use the predict method
        self.model = HER.load('./her_bit_env_new', env=self.env)

        obs = self.env.get_observation_simulated()
        for i in range(1):
            obs = self.env.reset()
            score = 0
            self.env.success_history.append(False)
            start = time.time()
            for j in range(1000):
                # obs needs simulated coords
                action, _ = self.model.predict(obs)
                obs, reward, done, info = self.env.step(action)
                score += reward
                if j != 49:
                    self.env.success_history[-1] = done
                # self.env.success_history[-1] = done
                print("Distance history: ", self.env.distance_history[-1])
                print("Success history: ", self.env.success_history[-1])
                if done:
                    end = time.time()
                    self.env.time_history.append(end - start)
                    break
                time.sleep(1)
            print("epoch: ", j)
            if j != 0:
                print("score:", score, "average score:", score / j)
            print("self.env.success_history[-1]: ", self.env.success_history[-1])
            print("success rate: ",
                  self.env.success_history.count(True) / len(self.env.success_history))
        return self.env.success_history, self.env.distance_history, self.env.time_history