class OUNoise:
    """Ornstein-Uhlenbeck process.

    The object keeps two noise sources side by side:

    * a hand-rolled state (``self.state``) that is advanced on every call to
      :meth:`sample`, and
    * ``self.baseline_noise``, an ``OrnsteinUhlenbeckActionNoise`` built once
      in ``__init__``.

    :meth:`sample` returns the value produced by ``baseline_noise``; the
    hand-rolled state is updated but not returned.
    """

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process.

        :param size: number of noise dimensions
        :param seed: value passed to ``random.seed``
        :param mu: mean the hand-rolled state is pulled towards
        :param theta: pull strength used in the hand-rolled update
        :param sigma: scale of the random term; also the sigma given to
            ``baseline_noise``
        """
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # random.seed() returns None, so seed the generator first and keep
        # the seed value itself on the instance.
        random.seed(seed)
        self.seed = seed
        self.baseline_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(size), sigma=sigma * np.ones(size)
        )
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)
        self.baseline_noise.reset()

    def sample(self):
        """Update internal state and return a noise sample.

        The hand-rolled state is advanced with uniform ``random.random()``
        draws, but the returned value comes from ``self.baseline_noise()``.
        """
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array(
            [random.random() for _ in range(len(x))]
        )
        self.state = x + dx
        return self.baseline_noise()

    def decay(self):
        """Shrink ``sigma`` and ``theta``, never going below 0.35 / 0.15.

        Because of the ``max`` floor, a ``sigma`` below 0.35 is raised to
        0.35. Only the attributes of this object are rebound; the
        ``baseline_noise`` instance is not touched here.
        """
        self.sigma = max(0.35, self.sigma * 0.99)
        self.theta = max(0.15, self.theta * 0.995)
def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
    """Initialize parameters and noise process.

    :param size: number of noise dimensions
    :param seed: value passed to ``random.seed``
    :param mu: scalar mean, broadcast to an array of length ``size``
    :param theta: stored on the instance as ``self.theta``
    :param sigma: stored as ``self.sigma`` and used as the sigma of the
        wrapped ``OrnsteinUhlenbeckActionNoise``
    """
    self.mu = mu * np.ones(size)
    self.theta = theta
    self.sigma = sigma
    # random.seed() returns None, so seed the generator first and keep the
    # seed value itself on the instance.
    random.seed(seed)
    self.seed = seed
    self.baseline_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(size), sigma=sigma * np.ones(size)
    )
    self.reset()
def _preprocess_action_noise(
    self, hyperparams: Dict[str, Any], saved_hyperparams: Dict[str, Any], env: VecEnv
) -> Dict[str, Any]:
    """Replace the ``noise_type``/``noise_std`` entries by an action-noise object.

    :param hyperparams: hyperparameter dict; modified in place
    :param saved_hyperparams: consulted for ``"model_class"`` when
        ``self.algo`` is ``"her"``
    :param env: environment whose ``action_space.shape[0]`` gives the noise size
    :return: the same ``hyperparams`` dict
    :raises RuntimeError: if the noise type matches neither ``"normal"`` nor
        ``"ornstein-uhlenbeck"``
    """
    # Special case for HER: the wrapped model class decides
    algo = saved_hyperparams["model_class"] if self.algo == "her" else self.algo

    # Nothing to do for other algorithms or when no noise was requested
    if algo not in ["ddpg", "sac", "td3", "tqc", "d3pg"] or hyperparams.get("noise_type") is None:
        return hyperparams

    # Parse noise string
    noise_type = hyperparams["noise_type"].strip()
    noise_std = hyperparams["noise_std"]

    # Save for later (hyperparameter optimization)
    self.n_actions = env.action_space.shape[0]

    # Substring match, so variants of the two names are accepted too
    if "normal" in noise_type:
        noise_class = NormalActionNoise
    elif "ornstein-uhlenbeck" in noise_type:
        noise_class = OrnsteinUhlenbeckActionNoise
    else:
        raise RuntimeError(f'Unknown noise type "{noise_type}"')

    hyperparams["action_noise"] = noise_class(
        mean=np.zeros(self.n_actions),
        sigma=noise_std * np.ones(self.n_actions),
    )
    print(f"Applying {noise_type} noise with std {noise_std}")

    # The raw entries are not valid model arguments
    del hyperparams["noise_type"]
    del hyperparams["noise_std"]

    return hyperparams
def test_vec_noise():
    """Check construction, sampling and the ``noises`` setter of VectorizedActionNoise."""
    n_envs, n_act = 4, 10
    base: ActionNoise = OrnsteinUhlenbeckActionNoise(np.zeros(n_act), np.ones(n_act) * 0.4)

    # Invalid number of environments
    for bad_n_envs in (-1, None, "whatever"):
        with pytest.raises(ValueError):
            VectorizedActionNoise(base, bad_n_envs)

    # Valid construction and sampling
    vec = VectorizedActionNoise(base, n_envs)
    assert vec.n_envs == n_envs
    assert vec().shape == (n_envs, n_act)
    assert not (vec() == base()).all()

    # Invalid base noise
    with pytest.raises(ValueError):
        VectorizedActionNoise(None, n_envs)
    with pytest.raises(TypeError):
        VectorizedActionNoise(12, n_envs)

    # Invalid assignments to the ``noises`` property
    with pytest.raises(AssertionError):
        vec.noises = []
    with pytest.raises(TypeError):
        vec.noises = None
    with pytest.raises(ValueError):
        vec.noises = [None] * vec.n_envs
    with pytest.raises(AssertionError):
        vec.noises = [base] * (n_envs - 1)

    # The rejected assignments must have left the noise list intact
    assert all(isinstance(noise, type(base)) for noise in vec.noises)
    assert len(vec.noises) == n_envs
def train_DDPG(self, model_name, model_params=DDPG_PARAMS):
    """Train a DDPG model on ``self.env``, save it and return it.

    :param model_name: used as the last path component of both the
        tensorboard log directory and the saved model
    :param model_params: mapping read for the keys ``batch_size``,
        ``buffer_size``, ``verbose`` and ``timesteps``
    :return: the trained model
    """
    from stable_baselines3.ddpg.ddpg import DDPG
    from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise

    env_train = self.env
    action_dim = env_train.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=0.5 * np.ones(action_dim),
    )

    started_at = time.time()
    ddpg_kwargs = dict(
        batch_size=model_params['batch_size'],
        buffer_size=model_params['buffer_size'],
        action_noise=ou_noise,
        verbose=model_params['verbose'],
        tensorboard_log=f"{zvt_env['log_path']}/{model_name}",
    )
    model = DDPG('MlpPolicy', env_train, **ddpg_kwargs)
    model.learn(total_timesteps=model_params['timesteps'], tb_log_name="DDPG_run")
    finished_at = time.time()

    # Saving is deliberately outside the timed section
    model.save(f"{zvt_env['model_path']}/{model_name}")
    elapsed_minutes = (finished_at - started_at) / 60
    print('Training time (DDPG): ', elapsed_minutes, ' minutes')
    return model
def create_model(env, algorithm, save_path):
    """Build an untrained agent of the requested kind.

    :param env: environment; ``action_space.shape[-1]`` sizes the noise
    :param algorithm: one of ``"ddpg"``, ``"td3"`` or ``"sac"``
    :param save_path: passed to the model as ``tensorboard_log``
    :return: the constructed model
    :raises Exception: if ``algorithm`` is none of the three names
    """
    # The noise object is shared by all three constructors below
    action_dim = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=0.2 * np.ones(action_dim),
        theta=0.15,
    )

    if algorithm == "ddpg":
        return DDPG(
            DDPG_MlpPolicy,
            env,
            learning_rate=0.001,
            buffer_size=1000000,
            batch_size=64,
            tau=0.001,
            gamma=0.99,
            train_freq=(10, "step"),
            action_noise=ou_noise,
            policy_kwargs=dict(optimizer_class=th.optim.AdamW),
            tensorboard_log=save_path,
        )
    if algorithm == "td3":
        return TD3(TD3_MlpPolicy, env, action_noise=ou_noise, tensorboard_log=save_path)
    if algorithm == "sac":
        return SAC(SAC_MlpPolicy, env, action_noise=ou_noise, tensorboard_log=save_path)

    raise Exception("--> Alican's LOG: Unknown agent type!")
def sample_td3_params(trial):
    """
    Sampler for TD3 hyperparams.

    :param trial: (optuna.trial) queried for every value; ``trial.n_actions``
        is read only when a noise type is drawn
    :return: (dict) sampled hyperparameters, with an ``action_noise`` entry
        when the drawn noise type is not ``None``
    """
    gamma = trial.suggest_categorical(
        'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical(
        'batch_size', [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical(
        'buffer_size', [int(1e4), int(1e5), int(1e6)])

    # Either one update per episode, or updates every ``train_freq`` steps
    if trial.suggest_categorical('episodic', [True, False]):
        n_episodes_rollout, train_freq, gradient_steps = 1, -1, -1
    else:
        train_freq = trial.suggest_categorical(
            'train_freq', [1, 16, 128, 256, 1000, 2000])
        gradient_steps, n_episodes_rollout = train_freq, -1

    noise_type = trial.suggest_categorical(
        'noise_type', ['ornstein-uhlenbeck', 'normal', None])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)

    arch_name = trial.suggest_categorical('net_arch',
                                          ["small", "medium", "big"])
    layer_sizes = {
        'small': [64, 64],
        'medium': [256, 256],
        'big': [400, 300],
    }

    hyperparams = dict(
        gamma=gamma,
        learning_rate=learning_rate,
        batch_size=batch_size,
        buffer_size=buffer_size,
        train_freq=train_freq,
        gradient_steps=gradient_steps,
        n_episodes_rollout=n_episodes_rollout,
        policy_kwargs=dict(net_arch=layer_sizes[arch_name]),
    )

    # Pick the noise class lazily so nothing is built for ``None``
    noise_class = None
    if noise_type == 'normal':
        noise_class = NormalActionNoise
    elif noise_type == 'ornstein-uhlenbeck':
        noise_class = OrnsteinUhlenbeckActionNoise

    if noise_class is not None:
        hyperparams['action_noise'] = noise_class(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
def _preprocess_action_noise(self, hyperparams: Dict[str, Any], saved_hyperparams: Dict[str, Any], env: VecEnv) -> Dict[str, Any]:
    """Replace the ``noise_type``/``noise_std`` entries by an action-noise object.

    :param hyperparams: hyperparameter dict; modified in place
    :param saved_hyperparams: not read by this method
    :param env: environment whose ``action_space.shape[0]`` gives the noise size
    :return: the same ``hyperparams`` dict
    :raises RuntimeError: if the noise type matches neither ``"normal"`` nor
        ``"ornstein-uhlenbeck"``
    """
    # Note: only off-policy algorithms are supported
    if hyperparams.get("noise_type") is None:
        return hyperparams

    # Parse noise string
    noise_type = hyperparams["noise_type"].strip()
    noise_std = hyperparams["noise_std"]

    # Save for later (hyperparameter optimization)
    self.n_actions = env.action_space.shape[0]

    # Substring match, so variants of the two names are accepted too
    if "normal" in noise_type:
        noise_class = NormalActionNoise
    elif "ornstein-uhlenbeck" in noise_type:
        noise_class = OrnsteinUhlenbeckActionNoise
    else:
        raise RuntimeError(f'Unknown noise type "{noise_type}"')

    hyperparams["action_noise"] = noise_class(
        mean=np.zeros(self.n_actions),
        sigma=noise_std * np.ones(self.n_actions),
    )
    print(f"Applying {noise_type} noise with std {noise_std}")

    # The raw entries are not valid model arguments
    del hyperparams["noise_type"]
    del hyperparams["noise_std"]

    return hyperparams
def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for DDPG hyperparams.

    :param trial: trial queried for every value; ``trial.n_actions`` is read
        only when a noise type is drawn
    :return: sampled hyperparameters, with an ``action_noise`` entry when the
        drawn noise type is not ``None``
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02])

    # Either one update per episode, or updates every ``train_freq`` steps
    if trial.suggest_categorical("episodic", [True, False]):
        n_episodes_rollout, train_freq, gradient_steps = 1, -1, -1
    else:
        train_freq = trial.suggest_categorical("train_freq", [1, 16, 128, 256, 1000, 2000])
        gradient_steps, n_episodes_rollout = train_freq, -1

    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    # NOTE: Add "verybig" to net_arch when tuning HER (see TD3)
    arch_name = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    layer_sizes = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
    }

    hyperparams = dict(
        gamma=gamma,
        tau=tau,
        learning_rate=learning_rate,
        batch_size=batch_size,
        buffer_size=buffer_size,
        train_freq=train_freq,
        gradient_steps=gradient_steps,
        n_episodes_rollout=n_episodes_rollout,
        policy_kwargs=dict(net_arch=layer_sizes[arch_name]),
    )

    # Pick the noise class lazily so nothing is built for ``None``
    noise_class = None
    if noise_type == "normal":
        noise_class = NormalActionNoise
    elif noise_type == "ornstein-uhlenbeck":
        noise_class = OrnsteinUhlenbeckActionNoise

    if noise_class is not None:
        hyperparams["action_noise"] = noise_class(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )

    return hyperparams
def main(): """ # Example with Vectorized env num_cpu = 4 # Number of processes to use my_env_kwargs={'renders': False} env = make_vec_env('panda-ip-reach-v0', n_envs=num_cpu, env_kwargs=my_env_kwargs) """ # Example with a simple Dummy vec env env = gym.envs.make('panda-ip-reach-v0', renders=False) env = DummyVecEnv([lambda: env]) #check_env(pandaenv) # The noise objects for DDPG n_actions = env.action_space.shape[-1] print("n_actions = {0}".format(n_actions)) #action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)) model = DDPG(policy='MlpPolicy', env=env, learning_rate=0.001, buffer_size=1000000, learning_starts=100, batch_size=100, tau=0.005, gamma=0.99, train_freq=1, gradient_steps=-1, action_noise=action_noise, optimize_memory_usage=False, tensorboard_log="./ddpg_panda_reach_tensorboard/", create_eval_env=False, policy_kwargs=None, verbose=1, seed=None, device='auto', _init_setup_model=True) """ print("start model evaluation without learning !") mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=1) print("end model evaluation !") """ print("start model learning !") model.learn(total_timesteps=200000, log_interval=10) print("end model learning !") print("-> model saved !!") model.save("ddpg_panda_reach") """ print("start model evaluation with learning !") mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=1) print("end model evaluation !") """ """
def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for TD3 hyperparams.

    :param trial: trial queried for every value; ``trial.n_actions`` is read
        when a noise type is drawn and ``trial.using_her_replay_buffer``
        decides whether ``sample_her_params`` post-processes the result
    :return: sampled hyperparameters
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])

    # As many gradient steps as environment steps between updates
    train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512])
    gradient_steps = train_freq

    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    # NOTE: Add "verybig" to net_arch when tuning HER
    arch_name = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    layer_sizes = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
        # Uncomment for tuning HER
        # "verybig": [256, 256, 256],
    }

    hyperparams = dict(
        gamma=gamma,
        learning_rate=learning_rate,
        batch_size=batch_size,
        buffer_size=buffer_size,
        train_freq=train_freq,
        gradient_steps=gradient_steps,
        policy_kwargs=dict(net_arch=layer_sizes[arch_name]),
        tau=tau,
    )

    # Pick the noise class lazily so nothing is built for ``None``
    noise_class = None
    if noise_type == "normal":
        noise_class = NormalActionNoise
    elif noise_type == "ornstein-uhlenbeck":
        noise_class = OrnsteinUhlenbeckActionNoise

    if noise_class is not None:
        hyperparams["action_noise"] = noise_class(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )

    if trial.using_her_replay_buffer:
        return sample_her_params(trial, hyperparams)
    return hyperparams
def objective(trial):
    """Train a DDPG agent with trial-chosen settings and score it.

    Two values are drawn from ``trial``: the noise sigma (``'Noise'``, between
    0.1 and 0.8) and the training length in thousands of steps
    (``'Timesteps'``, between 10 and 100). The module-level ``env`` is used
    for both training and scoring.

    :param trial: object providing ``suggest_uniform`` and ``suggest_int``
    :return: whatever ``test_model`` returns for the trained agent
    """
    sigma_scale = trial.suggest_uniform('Noise', 0.1, 0.8)
    kilo_steps = trial.suggest_int('Timesteps', 10, 100)

    action_dim = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=float(sigma_scale) * np.ones(action_dim),
    )

    agent = DDPG('MlpPolicy', env, action_noise=ou_noise)
    agent.learn(total_timesteps=kilo_steps * 1000, log_interval=1000)
    return test_model(env, agent, '')
def train_DDPG(env_train, model_name, timesteps=10000):
    """Train a DDPG model, save it and return it.

    :param env_train: training environment; ``action_space.shape[-1]`` sizes
        the exploration noise
    :param model_name: file name used under ``config.TRAINED_MODEL_DIR``
    :param timesteps: total number of training timesteps
    :return: the trained model
    """
    # Ornstein-Uhlenbeck exploration noise with sigma 0.5 in every dimension
    action_dim = env_train.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(action_dim),
        sigma=0.5 * np.ones(action_dim),
    )

    started_at = time.time()
    model = DDPG('MlpPolicy', env_train, action_noise=ou_noise)
    model.learn(total_timesteps=timesteps)
    finished_at = time.time()

    # Saving is deliberately outside the timed section
    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    elapsed_minutes = (finished_at - started_at) / 60
    print('Training time (DDPG): ', elapsed_minutes, ' minutes')
    return model
def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model.

    Builds a DDPG agent with Ornstein-Uhlenbeck action noise (sigma 0.5 in
    every action dimension), trains it, saves it under
    ``config.TRAINED_MODEL_DIR`` and prints the training time in minutes.

    :param env_train: training environment; ``action_space.shape[-1]`` sizes
        the exploration noise
    :param model_name: file name used under ``config.TRAINED_MODEL_DIR``
    :param timesteps: total number of training timesteps
    :return: the trained model
    """
    # the noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions), sigma=0.5 * np.ones(n_actions)
    )

    start = time.time()
    # No ``param_noise`` argument is passed: the unused local that used to
    # hold it has been removed.
    model = DDPG('MlpPolicy', env_train, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
def action_noise(hyper, algo, n_actions):
    """
    Configure Action Noise from hyperparameter logs

    Rewrites ``hyper`` in place:

    * ``params_train_freq`` becomes ``(1, "episode")`` when
      ``params_episodic`` is truthy, else ``(int(params_train_freq), "step")``;
    * ``params_action_noise`` is set to a noise object built from
      ``params_noise_type`` / ``params_noise_std``, or to ``None`` for any
      other noise type.

    :param hyper: mapping of logged hyperparameters
    :param algo: not used by this function
    :param n_actions: size of the noise vectors
    :return: the same ``hyper`` object
    """
    hyper['params_train_freq'] = (
        (1, "episode")
        if hyper['params_episodic']
        else (int(hyper['params_train_freq']), "step")
    )

    noise_type = hyper["params_noise_type"]
    if noise_type == "normal":
        noise_class = NormalActionNoise
    elif noise_type == "ornstein-uhlenbeck":
        noise_class = OrnsteinUhlenbeckActionNoise
    else:
        hyper["params_action_noise"] = None
        return hyper

    hyper["params_action_noise"] = noise_class(
        mean=np.zeros(n_actions),
        sigma=hyper['params_noise_std'] * np.ones(n_actions))
    return hyper
def action_noise(hyper, algo, n_actions):
    """
    Configure Action Noise from hyperparameter logs

    Rewrites ``hyper`` in place:

    * when ``params_episodic`` is truthy: ``params_n_episodes_rollout`` is 1
      and both ``params_train_freq`` and ``params_gradient_steps`` are -1;
    * otherwise ``params_train_freq`` is kept, ``params_gradient_steps`` is
      set to the same value and ``params_n_episodes_rollout`` is -1;
    * ``params_action_noise`` is set to a noise object built from
      ``params_noise_type`` / ``params_noise_std``, or to ``None`` for any
      other noise type.

    :param hyper: mapping of logged hyperparameters
    :param algo: not used by this function
    :param n_actions: size of the noise vectors
    :return: the same ``hyper`` object
    """
    if hyper['params_episodic']:
        hyper['params_n_episodes_rollout'] = 1
        hyper['params_train_freq'], hyper['params_gradient_steps'] = -1, -1
    else:
        # ``params_train_freq`` stays as logged (the former self-assignment
        # was a no-op); gradient steps simply mirror it.
        hyper['params_gradient_steps'] = hyper['params_train_freq']
        hyper['params_n_episodes_rollout'] = -1

    if hyper["params_noise_type"] == "normal":
        hyper["params_action_noise"] = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    elif hyper["params_noise_type"] == "ornstein-uhlenbeck":
        hyper["params_action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    else:
        hyper["params_action_noise"] = None
    return hyper
def make_model(config, env):
    """Build a TD3 model from a configuration mapping.

    :param config: mapping read for the policy name, logging, noise and TD3
        settings (see the keys used below)
    :param env: environment the model is created for
    :return: the constructed ``TD3`` model (always on device ``"cpu"``)
    """
    # Policy: either the configured name or the custom TCN wrapper
    policy = config["policy_name"]
    if policy == "CustomTCNPolicy":
        policy = customActorCriticPolicyWrapper(
            env.observation_space.shape[0] // config["obs_input"],
            config["obs_input"],
        )

    # Tensorboard directory is per session, and only set when enabled
    tb_log = "./tb/{}/".format(config["session_ID"]) if config["tensorboard_log"] else None

    # Optional Ornstein-Uhlenbeck exploration noise
    ou_noise = None
    if config["ou_noise"]:
        action_dim = env.action_space.shape[0]
        ou_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(action_dim),
            sigma=config["ou_sigma"] * np.ones(action_dim),
            theta=config["ou_theta"],
            dt=config["ou_dt"],
            initial_noise=None,
        )

    td3_kwargs = dict(
        policy=policy,
        env=env,
        buffer_size=config["buffer_size"],
        learning_starts=config["learning_starts"],
        action_noise=ou_noise,
        target_policy_noise=config["target_policy_noise"],
        target_noise_clip=config["target_noise_clip"],
        gamma=config["gamma"],
        tau=config["tau"],
        # NOTE(review): eval() executes whatever string the config holds;
        # only safe while the configuration comes from a trusted source.
        learning_rate=eval(config["learning_rate"]),
        verbose=config["verbose"],
        tensorboard_log=tb_log,
        device="cpu",
        # Two hidden layers of equal width
        policy_kwargs=dict(net_arch=[int(config["policy_hid_dim"])] * 2),
    )
    return TD3(**td3_kwargs)
import numpy as np import pytest from stable_baselines3 import A2C, PPO, SAC, TD3 from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1)) @pytest.mark.parametrize('action_noise', [normal_action_noise, OrnsteinUhlenbeckActionNoise(np.zeros(1), 0.1 * np.ones(1))]) def test_td3(action_noise): model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]), learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise) model.learn(total_timesteps=1000, eval_freq=500) @pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0']) def test_a2c(env_id): model = A2C('MlpPolicy', env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True) model.learn(total_timesteps=1000, eval_freq=500) @pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0']) @pytest.mark.parametrize("clip_range_vf", [None, 0.2, -0.2]) def test_ppo(env_id, clip_range_vf): if clip_range_vf is not None and clip_range_vf < 0: # Should throw an error with pytest.raises(AssertionError): model = PPO('MlpPolicy', env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True, clip_range_vf=clip_range_vf) else:
import gym
import numpy as np
import pytest

from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

# Module-level Gaussian noise instance (1 dimension, sigma 0.1) used in the
# parametrization below.
normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))


# Every model class is combined with every noise object (2 x 2 test cases).
@pytest.mark.parametrize("model_class", [TD3, DDPG])
@pytest.mark.parametrize("action_noise", [
    normal_action_noise,
    OrnsteinUhlenbeckActionNoise(np.zeros(1), 0.1 * np.ones(1))
])
def test_deterministic_pg(model_class, action_noise):
    """
    Test for DDPG and variants (TD3).

    Builds the model on ``Pendulum-v0`` with a small network and replay
    buffer, then runs a short ``learn`` call; the test passes if nothing
    raises.

    :param model_class: algorithm class under test
    :param action_noise: exploration noise object handed to the model
    """
    model = model_class(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=100,
        verbose=1,
        create_eval_env=True,
        # Buffer smaller than the number of timesteps trained below
        buffer_size=250,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=300, eval_freq=250)
def run_model_stablebaseline(flow_params,
                             num_cpus=1,
                             rollout_size=50,
                             num_steps=50,
                             algorithm="ppo",
                             exp_config=None):
    """Run the model for num_steps if provided.

    Parameters
    ----------
    flow_params : dict
        flow-specific parameters
    num_cpus : int
        number of CPUs used during training
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps
    algorithm : str
        "ppo" or "ddpg"; matched case-insensitively
    exp_config : str or None
        experiment name; 'singleagent_figure_eight' selects a dedicated DDPG
        configuration (later learning start, smaller batch, CUDA device)

    The total rollout length is rollout_size.

    Returns
    -------
    stable_baselines3.*
        the trained model

    Raises
    ------
    ValueError
        if ``algorithm`` is neither "ppo" nor "ddpg"
    """
    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

    # The default is spelled "ppo" while the branches used to compare against
    # upper-case names only, so normalise once and fail loudly on anything
    # unknown instead of silently returning None.
    algo = algorithm.upper()
    if algo not in ("PPO", "DDPG"):
        raise ValueError(
            'Unknown algorithm "{}", expected "PPO" or "DDPG"'.format(algorithm))

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    if algo == "PPO":
        from stable_baselines3 import PPO

        train_model = PPO('MlpPolicy', env, verbose=1, n_steps=rollout_size)
        train_model.learn(total_timesteps=num_steps)
        print("Learning Process is Done.")
        return train_model

    # algo == "DDPG"
    from stable_baselines3 import DDPG
    from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
    import numpy as np

    # The two configurations differ only in these three settings
    figure_eight = exp_config == 'singleagent_figure_eight'
    train_model = DDPG(
        'MlpPolicy',
        env,
        verbose=1,
        n_episodes_rollout=rollout_size,
        learning_starts=3000 if figure_eight else 1200,
        learning_rate=0.0001,
        action_noise=OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(1),
            sigma=0.15 * np.ones(1),
            initial_noise=0.7 * np.ones(1)),
        tau=0.005,
        batch_size=128 if figure_eight else 512,
        tensorboard_log='tensorboard_ddpg',
        device='cuda' if figure_eight else 'cpu',
    )

    train_model.learn(
        total_timesteps=num_steps,
        log_interval=2,
        eval_log_path='ddpg_log',
        # NOTE(review): eval_freq used to be passed twice (2, then 10), which
        # is a SyntaxError; the later value is kept here -- confirm intent.
        eval_freq=10,
    )
    print("Learning Process is Done.")
    return train_model
n_actions = env.action_space.shape[0] if 'normal' in noise_type: if 'lin' in noise_type: final_sigma = hyperparams.get('noise_std_final', 0.0) * np.ones(n_actions) hyperparams['action_noise'] = LinearNormalActionNoise( mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions), final_sigma=final_sigma, max_steps=n_timesteps) else: hyperparams['action_noise'] = NormalActionNoise( mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)) elif 'ornstein-uhlenbeck' in noise_type: hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise( mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)) else: raise RuntimeError(f'Unknown noise type "{noise_type}"') print(f"Applying {noise_type} noise with std {noise_std}") del hyperparams['noise_type'] del hyperparams['noise_std'] if 'noise_std_final' in hyperparams: del hyperparams['noise_std_final'] if args.trained_agent.endswith('.zip') and os.path.isfile( args.trained_agent): # Continue training print("Loading pretrained agent") # Policy should not be changed del hyperparams['policy']
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecNormalize
import gym, gym_conservation

# Experiment settings
env_id = "conservation-v6"  # "fishing-v1"
algo = "td3"
outdir = "results"
total_timesteps = 1500000
verbose = 0
seed = 0
tensorboard_log = "/var/log/tensorboard/single"
log_dir = "logs"

# Exploration noise. The value must be a plain float: a trailing comma here
# would turn it into a 1-tuple.
noise_std = 0.4805935357322933
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(1),
                                            sigma=noise_std * np.ones(1))

# Fixed hyperparameters for the run
hyper = {
    "gamma": 0.995,
    "learning_rate": 8.315382409902049e-05,
    "batch_size": 512,
    "buffer_size": 10000,
    "train_freq": 1000,
    "gradient_steps": 1000,
    "n_episodes_rollout": -1,
    "action_noise": action_noise,
    "policy_kwargs": {
        "net_arch": [64, 64]
    }
}
#norm_env = VecNormalize(make_vec_env(env_id), gamma = hyper["gamma"])
f.close()

# A2C algorithm: train, save and evaluate ``n_tests`` independent models
for i in range(n_tests):
    test_name = f'saved_models/a2c_soccer_actions_env_1_{i}'
    model = A2C('MlpPolicy', env)
    model.learn(total_timesteps=25000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

# The action dimension does not change between runs, so read it once
n_actions = env.action_space.shape[-1]

# DDPG algorithm, default network
for i in range(n_tests):
    test_name = f'saved_models/ddpg_soccer_actions_env_1_{i}'
    # A fresh noise object per run so no noise state is carried over
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.3 * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

# DDPG algorithm, [400, 300] network
for i in range(n_tests):
    test_name = f'saved_models/ddpg_soccer_actions_env_2_{i}'
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.3 * np.ones(n_actions))
    policy_kwargs = dict(net_arch=[400, 300])
    model = DDPG('MlpPolicy',
                 env,
                 action_noise=action_noise,
                 policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)
if hyper['episodic']: hyper['n_episodes_rollout'] = 1 hyper['train_freq'], hyper['gradient_steps'] = -1, -1 else: hyper['train_freq'] = hyper['train_freq'] hyper['gradient_steps'] = hyper['train_freq'] hyper['n_episodes_rollout'] = -1 n_actions = env.action_space.shape[0] if hyper["noise_type"] == "normal": hyper["action_noise"] = NormalActionNoise(mean=np.zeros(n_actions), sigma=hyper['noise_std'] * np.ones(n_actions)) elif noise_type == "ornstein-uhlenbeck": hyper["action_noise"] = OrnsteinUhlenbeckActionNoise( mean=np.zeros(n_actions), sigma=hyper['noise_std'] * np.ones(n_actions)) model = DDPG('MlpPolicy', env, verbose=0, tensorboard_log=tensorboard_log, seed=seed, gamma=hyper['gamma'], learning_rate=hyper['lr'], batch_size=hyper['batch_size'], buffer_size=hyper['buffer_size'], action_noise=hyper['action_noise'], train_freq=hyper['train_freq'], gradient_steps=hyper['train_freq'], n_episodes_rollout=hyper['n_episodes_rollout'],
#### Create custom policy ########################################################################## CustomPolicy = MlpPolicy CustomPolicy.layers = [64, 64, 32] # actor network has layers [64, 64, 32] #### Check the environment's spaces ################################################################ env = RLTetherAviary(gui=False, record=False) env = Monitor(env, log_dir) print("[INFO] Action space:", env.action_space) print("[INFO] Observation space:", env.observation_space) print("[INFO] Checking Environment...") check_env(env, warn=True, skip_render_check=True) #### action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(env.N_ACTIONS), sigma=0.1 * np.ones(env.N_ACTIONS), dt=0.005) #### Create the callback: check every 1000 steps callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir) #### Train the model ############################################################################### model = DDPG(CustomPolicy, env, verbose=1, batch_size=64, action_noise=action_noise) for i in range(step_iters): # run for step_iters * training_timesteps
from stable_baselines3 import TD3
from stable_baselines3.td3.policies import MlpPolicy
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.evaluation import evaluate_policy

# Environment the TD3 agent is trained and evaluated on.
env = gym.make('Pendulum-v0')

# check env
#from stable_baselines3.common.env_checker import check_env
#check_env(env)

# The noise objects for TD3: one noise dimension per action dimension.
n_actions = env.action_space.shape[-1]
#action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1)

# Baseline: evaluate the untrained model over 100 episodes.
print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

# Training.
print("start model learning !")
model.learn(total_timesteps=10000, log_interval=10)
print("end model learning !")

# NOTE(review): the "saved" message is printed before save() actually runs.
print("-> model saved !!")
model.save("td3_pendulum")

print("start model evaluation with learning !")