def action_noise(hyper, algo, n_actions):
    """Configure action noise from hyperparameter logs."""
    if hyper['params_episodic']:
        hyper['params_train_freq'] = (1, "episode")
    else:
        hyper['params_train_freq'] = (int(hyper['params_train_freq']), "step")
    if hyper["params_noise_type"] == "normal":
        hyper["params_action_noise"] = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    elif hyper["params_noise_type"] == "ornstein-uhlenbeck":
        hyper["params_action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    else:
        hyper["params_action_noise"] = None
    return hyper
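# Hedged usage sketch (not part of the original helper): the 'params_*' keys suggest the dict
# comes from a hyperparameter-tuning log, with the prefix stripped when the values are handed
# to the algorithm constructor. `build_td3_from_log` and the key mapping below are assumptions
# for illustration only.
def build_td3_from_log(hyper, env):
    hyper = action_noise(hyper, 'td3', env.action_space.shape[0])
    return TD3('MlpPolicy',
               env,
               train_freq=hyper['params_train_freq'],      # (1, "episode") or (n, "step")
               action_noise=hyper['params_action_noise'],  # None, NormalActionNoise, or OU noise
               verbose=0)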
def test_stable_DDPG(env_name, request):
    env = request.getfixturevalue(env_name)
    # DDPG must fail in discrete environments
    if env_name == 'env_demo':
        with pytest.raises(IndexError):
            env.action_space.shape[-1]
        with pytest.raises(AssertionError):
            model = stable_baselines3.DDPG("MlpPolicy", env, verbose=1)
    else:
        # Action noise
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        # Model
        model = stable_baselines3.DDPG("MlpPolicy",
                                       env,
                                       action_noise=action_noise,
                                       verbose=1)
        model.learn(total_timesteps=TIMESTEPS)

        # Check model state
        assert model.action_space == env.action_space
        assert model.env.action_space == env.action_space
        assert isinstance(model.policy, stable_baselines3.td3.policies.TD3Policy)

        # Check model works
        obs = env.reset()
        a, _ = model.predict(obs)
        obs, reward, done, info = env.step(a)
        assert reward is not None and reward < 0
        assert a is not None
        assert isinstance(done, bool)
        assert info['timestep'] == 1
    env.close()
def train_td3():
    log_dir = "model_save/"
    env = ENV_CONTINUE(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    # model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=1, batch_size=2048, seed=1)
    model = TD3('MlpPolicy', env, verbose=1, batch_size=2048, seed=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(100000), callback=callback, log_interval=100)
    model.save('model_save/td3')
def test_sac(ent_coef, i):
    model = SAC(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=5e3,
        verbose=1,
        create_eval_env=True,
        buffer_size=1000000,
        ent_coef=ent_coef,
        action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
        # tensorboard_log="./sac_pendulum_tensorboard/"
    )
    eval_env = gym.make('Pendulum-v0')
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=250,
                                 deterministic=True,
                                 render=False)
    model.learn(total_timesteps=20000, callback=eval_callback)
def test_continuous(model_class):
    env = IdentityEnvBox(eps=0.5)
    n_steps = {
        A2C: 3500,
        PPO: 3000,
        SAC: 700,
        TD3: 500,
        DDPG: 500,
    }[model_class]
    kwargs = dict(policy_kwargs=dict(net_arch=[64, 64]), seed=0, gamma=0.95)
    if model_class in [TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        kwargs["action_noise"] = action_noise
    model = model_class("MlpPolicy", env, **kwargs).learn(n_steps)
    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
def action_noise(hyper, algo, n_actions):
    """Configure action noise from hyperparameter logs."""
    if hyper['params_episodic']:
        hyper['params_n_episodes_rollout'] = 1
        hyper['params_train_freq'], hyper['params_gradient_steps'] = -1, -1
    else:
        hyper['params_train_freq'] = hyper['params_train_freq']
        hyper['params_gradient_steps'] = hyper['params_train_freq']
        hyper['params_n_episodes_rollout'] = -1
    if hyper["params_noise_type"] == "normal":
        hyper["params_action_noise"] = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    elif hyper["params_noise_type"] == "ornstein-uhlenbeck":
        hyper["params_action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    else:
        hyper["params_action_noise"] = None
    return hyper
def test_sac2():
    reward = []
    for i in [6000, 8000, 10000]:
        model = SAC("MlpPolicy",
                    "Pendulum-v0",
                    policy_kwargs=dict(net_arch=[64, 64]),
                    learning_starts=5000,
                    verbose=0,
                    create_eval_env=True,
                    buffer_size=i,
                    ent_coef=0,
                    action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                    batch_size=32)
        eval_env = gym.make('Pendulum-v0')
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/alpha4_histogram',
                                     eval_freq=250,
                                     n_eval_episodes=5,
                                     deterministic=True,
                                     render=False)
        model.learn(total_timesteps=20000, callback=eval_callback)
        reward.append(eval_callback.last_mean_reward)

        hist, bins = np.histogram(model.replay_buffer.rewards, bins=500)
        x = []
        for h in range(len(hist)):
            for j in range(hist[h]):
                x.append(bins[h])
        plt.hist(x, bins=bins)
        plt.xlabel("reward")
        plt.ylabel("population")
        plt.title("last mean reward = {:.2f} +/- {:.2f}, replay size = {}".format(
            reward[-1], eval_callback.last_std, i))
        plt.legend()
        plt.show()
    return reward
def main():
    # Create log dir
    log_dir = './sac_data'
    os.makedirs(log_dir, exist_ok=True)
    vix_env = trading_vix_env.trading_vix_env()
    env = Monitor(vix_env, log_dir)

    # Create action noise because TD3 and DDPG use a deterministic policy
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    # Create the callback: check every 20000 steps
    callback = custom_call_back.CustomCallback(check_freq=20000, log_dir=log_dir)

    # Create RL model
    model = SAC('MlpPolicy', env, action_noise=action_noise, verbose=2, batch_size=10000)

    # Train the agent
    model.learn(total_timesteps=int(5e9), callback=callback)
def test_goal_selection_strategy(goal_selection_strategy, online_sampling):
    """
    Test different goal strategies.
    """
    env = BitFlippingEnv(continuous=True)
    normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))
    model = HER(
        "MlpPolicy",
        env,
        SAC,
        goal_selection_strategy=goal_selection_strategy,
        online_sampling=online_sampling,
        gradient_steps=1,
        train_freq=4,
        max_episode_length=10,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=100,
        action_noise=normal_action_noise,
    )
    assert model.action_noise is not None
    model.learn(total_timesteps=300)
def train_DDPG(env):
    print(f"action space shape -1:{env.action_space.shape[-1]}")
    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.02 * np.ones(n_actions))
    model = DDPG(
        'MlpPolicy',
        env,
        learning_rate=0.0003,
        learning_starts=5,
        train_freq=10,
        n_episodes_rollout=-1,
        buffer_size=100000,
        action_noise=action_noise,
        batch_size=128,
        verbose=2,
    )
    model.learn(total_timesteps=1000000, log_interval=1)
    model.save("DDPG_pkl")
mdp = OvercookedGridworld.from_layout_name("cramped_room_single")
base_env = OvercookedEnv.from_mdp(mdp, horizon=1e4)
env = gym.make('Overcooked-single-v0')
env.custom_init(base_env, base_env.lossless_state_encoding_mdp_single)
env = Monitor(env, "./her_overcooked/", allow_early_resets=True)

# Available strategies (cf paper): future, final, episode
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# If True the HER transitions will get sampled online
online_sampling = True
# Time limit for the episodes
max_episode_length = 50

action_noise = NormalActionNoise(mean=np.zeros(1), sigma=0.3 * np.ones(1))

# Initialize the model
model = HER(
    "MlpPolicy",
    env,
    model_class,
    n_sampled_goal=4,
    goal_selection_strategy=goal_selection_strategy,
    # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper
    # we have to manually specify the max number of steps per episode
    max_episode_length=max_episode_length,
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
                policy_kwargs=onpolicy_kwargs,
                tensorboard_log=filename + '/tb/',
                verbose=1
                ) if ARGS.obs == ObservationType.KIN else PPO(a2cppoCnnPolicy,
                                                              train_env,
                                                              policy_kwargs=onpolicy_kwargs,
                                                              tensorboard_log=filename + '/tb/',
                                                              verbose=1
                                                              )

#### Off-policy algorithms #################################
offpolicy_kwargs = dict(activation_fn=torch.nn.ReLU,
                        # net_arch=[512, 512, 256, 128]
                        net_arch=[400, 300]
                        )  # or None  # or dict(net_arch=dict(qf=[256, 128, 64, 32], pi=[256, 128, 64, 32]))
action_noise = NormalActionNoise(np.array([0.0]), np.array([0.2]))
if ARGS.algo == 'sac':
    model = SAC(sacMlpPolicy,
                train_env,
                policy_kwargs=offpolicy_kwargs,
                tensorboard_log=filename + '/tb/',
                verbose=1
                ) if ARGS.obs == ObservationType.KIN else SAC(sacCnnPolicy,
                                                              train_env,
                                                              policy_kwargs=offpolicy_kwargs,
                                                              tensorboard_log=filename + '/tb/',
                                                              verbose=1
                                                              )
if ARGS.algo == 'td3':
    model = TD3(td3ddpgMlpPolicy,
def using_callback_example():
    # Using Callback: Monitoring Training.
    class SaveOnBestTrainingRewardCallback(BaseCallback):
        """
        Callback for saving a model (the check is done every 'check_freq' steps)
        based on the training reward (in practice, we recommend using 'EvalCallback').

        :param check_freq:
        :param log_dir: Path to the folder where the model will be saved.
            It must contain the file created by the 'Monitor' wrapper.
        :param verbose: Verbosity level.
        """

        def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
            super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
            self.check_freq = check_freq
            self.log_dir = log_dir
            self.save_path = os.path.join(log_dir, "best_model")
            self.best_mean_reward = -np.inf

        def _init_callback(self) -> None:
            # Create folder if needed.
            if self.save_path is not None:
                os.makedirs(self.save_path, exist_ok=True)

        def _on_step(self) -> bool:
            if self.n_calls % self.check_freq == 0:
                # Retrieve training reward.
                x, y = ts2xy(load_results(self.log_dir), "timesteps")
                if len(x) > 0:
                    # Mean training reward over the last 100 episodes.
                    mean_reward = np.mean(y[-100:])
                    if self.verbose > 0:
                        print(f"Num timesteps: {self.num_timesteps}")
                        print(f"Best mean reward: {self.best_mean_reward:.2f} - "
                              f"Last mean reward per episode: {mean_reward:.2f}")
                    # New best model, you could save the agent here.
                    if mean_reward > self.best_mean_reward:
                        self.best_mean_reward = mean_reward
                        # Example for saving best model.
                        if self.verbose > 0:
                            print(f"Saving new best model to {self.save_path}")
                        self.model.save(self.save_path)
            return True

    # Create log dir.
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment.
    env = gym.make("LunarLanderContinuous-v2")
    env = Monitor(env, log_dir)

    # Add some action noise for exploration.
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    # Create the TD3 model with action noise.
    model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=0)

    # Create the callback: check every 1000 steps.
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

    # Train the agent.
    timesteps = 1e5
    model.learn(total_timesteps=int(timesteps), callback=callback)

    plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS, "TD3 LunarLander")
    plt.show()
from stable_baselines3.td3.policies import MlpPolicy
from TD3_torch import TD3
from Config import Config
arg = Config()
import numpy as np
from numpy import pi
import time
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from reward_functions import reward_singleff
from FireflyEnv import ffacc_real

action_noise = NormalActionNoise(mean=0., sigma=float(0.3))
arg.init_action_noise = 0.5
arg.goal_distance_range = [0.3, 1]
arg.mag_action_cost_range = [0.1, 1.]
arg.dev_action_cost_range = [0.1, 1.]
arg.dev_v_cost_range = [0.1, 1.]
arg.dev_w_cost_range = [0.1, 1.]
# arg.goal_distance_range = [0.01, 0.99]
arg.gains_range = [0.35, 0.45, pi / 2 - 0.1, pi / 2 + 0.1]
# arg.goal_radius_range = [0.07, 0.2]
arg.std_range = [0.01, 0.07, 0.01, 0.07]
# arg.mag_action_cost_range = [0.0001, 0.0005]
# arg.dev_action_cost_range = [0.0001, 0.0005]
arg.reward_amount = 100
arg.terminal_vel = 0.05
arg.dt = 0.1
arg.episode_len = 100
arg.training = True
arg.presist_phi = False
arg.agent_knows_phi = True
def train(experiment_name: str = typer.Option(...),
          total_timesteps: int = 3000000,
          input_path: Optional[str] = None,
          agent_type: SingleOrMultiAgent = SingleOrMultiAgent.single_agent,
          env_seed: int = random.randint(0, int(1e6)),
          environment_port: int = 5005,
          device: str = 'cuda',
          gamma: float = 0.99,
          learning_rate: float = 5e-5,
          policy_layers_comma_sep: str = '128,128,128',
          value_layers_comma_sep: str = '128,128,128',
          eval_freq: int = 100000,
          n_eval_episodes: int = 40,
          rl_algorithm: RLAlgorithm = RLAlgorithm.ppo,
          n_envs: Optional[int] = None,
          batch_size: Optional[int] = None,
          n_steps: Optional[int] = None,
          ppo_target_kl: Optional[float] = 0.1,
          ppo_a2c_gae_lambda: float = 0.95,
          ppo_n_epochs: int = 10,
          ppo_clip_range: float = 0.2,
          log_std_init: Optional[float] = None,
          ppo_a2c_ortho_init: Optional[bool] = None,
          td3_sac_buffer_size: Optional[int] = None,
          sac_tau: Optional[float] = None,
          sac_train_freq: Optional[int] = None,
          td3_sac_gradient_steps: Optional[int] = None,
          td3_sac_learning_starts: Optional[int] = None,
          td3_noise_type: Optional[str] = None,
          td3_noise_std: Optional[float] = None,
          use_sde: Optional[bool] = None,
          sde_sample_freq: Optional[int] = None,
          normalize=False,
          normalize_advantage: Optional[bool] = None,
          use_rms_prop: Optional[bool] = None,
          activation_function: Optional[str] = None):
    """Train an agent in the reacher environment.

    Args:
        experiment_name: the name of the experiment, used to create a directory under
            'experiments' where all training artifacts are stored along with the final and
            best models
        total_timesteps: the number of timesteps to run before stopping training
        input_path: if provided, the model is loaded from that path - this is used to
            continue a previous training run
        agent_type: whether to use the environment with one agent or the environment with
            20 agents
        env_seed: a seed for the environment's random initialization - if not set, defaults
            to random
        environment_port: the port used by the unity environment to communicate with the C#
            backend. Use different ports for environments which run in parallel.
        device: the device used to train the model, can be 'cpu' or 'cuda:x'
        gamma: the discount rate applied to future rewards
        learning_rate: the learning rate used by the policy and value network optimizer
        ppo_target_kl: an upper limit on the target KL divergence. This goes a bit against
            PPO's aim of reducing the number of hyper-parameters, but is still useful because
            agents can experience catastrophic forgetting if this value becomes too high.
            The idea is to use it as a safeguard rather than a tunable hyper-parameter.
        policy_layers_comma_sep: layer widths for the policy network as a comma-separated list
        value_layers_comma_sep: layer widths for the value network as a comma-separated list
        eval_freq: the number of steps between validation rounds. Whenever there is an
            improvement, the best model is saved under the 'eval' directory of the
            experiment. Available only for the single-agent environment.
        n_eval_episodes: number of episodes run during evaluation, available only for the
            single-agent environment
        rl_algorithm: the algorithm used to train an agent
        n_envs: the number of agents used during training. Applicable only in multi-agent
            training; the maximum number of agents is 20. All 20 agents of the unity
            environment will be active, but only the first 'n_envs' take part in training.
        batch_size: the batch size used during training
        n_steps: number of steps run during rollout
    """
    experiment_path = EXPERIMENTS_DIR / experiment_name
    model_path = experiment_path / 'model'
    eval_path = experiment_path / 'eval'
    tensorboard_log_path = experiment_path / 'tensorboard_logs'
    for path in [experiment_path, eval_path, tensorboard_log_path]:
        path.mkdir(exist_ok=True, parents=True)

    env = create_environment(agent_type=agent_type,
                             normalize=normalize,
                             n_envs=n_envs,
                             env_seed=env_seed,
                             environment_port=environment_port,
                             training_mode=True)

    algorithm_class, policy = algorithm_and_policy[rl_algorithm]

    if input_path:
        model = algorithm_class.load(input_path, env=env)
    else:
        policy_layers = [
            int(layer_width) for layer_width in policy_layers_comma_sep.split(',')
        ]
        value_layers = [
            int(layer_width) for layer_width in value_layers_comma_sep.split(',')
        ]
        net_arch = (policy_layers if rl_algorithm in [RLAlgorithm.td3, RLAlgorithm.sac]
                    else [dict(vf=value_layers, pi=policy_layers)])
        policy_kwargs = remove_none_entries(
            dict(activation_fn=nn.ReLU if activation_function == 'ReLU' else None,
                 net_arch=net_arch,
                 log_std_init=log_std_init,
                 ortho_init=ppo_a2c_ortho_init))

        if rl_algorithm == RLAlgorithm.ppo:
            algorithm_specific_parameters = dict(target_kl=ppo_target_kl,
                                                 gae_lambda=ppo_a2c_gae_lambda,
                                                 n_epochs=ppo_n_epochs,
                                                 clip_range=ppo_clip_range)
        elif rl_algorithm == RLAlgorithm.a2c:
            algorithm_specific_parameters = dict(
                normalize_advantage=normalize_advantage, use_rms_prop=use_rms_prop)
        elif rl_algorithm == RLAlgorithm.sac:
            algorithm_specific_parameters = dict(
                buffer_size=td3_sac_buffer_size,
                tau=sac_tau,
                train_freq=sac_train_freq,
                gradient_steps=td3_sac_gradient_steps,
                learning_starts=td3_sac_learning_starts)
        elif rl_algorithm == RLAlgorithm.td3:
            action_shape = (env.num_envs, env.action_space.shape[0])
            action_noise = (NormalActionNoise(
                np.zeros(action_shape, dtype=np.float32),
                td3_noise_std * np.ones(action_shape, dtype=np.float32))
                            if td3_noise_type == 'normal' else None)
            algorithm_specific_parameters = remove_none_entries(
                dict(buffer_size=td3_sac_buffer_size,
                     gradient_steps=td3_sac_gradient_steps,
                     learning_starts=td3_sac_learning_starts,
                     action_noise=action_noise))
        else:
            algorithm_specific_parameters = dict()

        model_optional_parameters = remove_none_entries(
            dict(batch_size=batch_size,
                 n_steps=n_steps,
                 use_sde=use_sde,
                 sde_sample_freq=sde_sample_freq))

        model = algorithm_class(policy,
                                env,
                                verbose=1,
                                tensorboard_log=str(tensorboard_log_path),
                                device=device,
                                gamma=gamma,
                                policy_kwargs=policy_kwargs,
                                learning_rate=learning_rate,
                                **model_optional_parameters,
                                **remove_none_entries(algorithm_specific_parameters))

    eval_callback = ReacherEvaluationCallback(eval_env=env,
                                              eval_freq=eval_freq,
                                              n_eval_episodes=n_eval_episodes,
                                              n_agents=n_envs if n_envs else 1,
                                              eval_path=eval_path,
                                              normalization=normalize)
    model.learn(total_timesteps=total_timesteps, callback=[eval_callback])
    model.save(str(model_path))
    model.get_vec_normalize_env().save(str(model_path / 'vecnormalize.pkl'))
def sample_td3_params(trial):
    """
    Sampler for TD3 hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical(
        'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size',
                                           [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical(
        'buffer_size', [int(1e4), int(1e5), int(1e6)])
    episodic = trial.suggest_categorical('episodic', [True, False])

    if episodic:
        n_episodes_rollout = 1
        train_freq, gradient_steps = -1, -1
    else:
        train_freq = trial.suggest_categorical('train_freq',
                                               [1, 16, 128, 256, 1000, 2000])
        gradient_steps = train_freq
        n_episodes_rollout = -1

    noise_type = trial.suggest_categorical(
        'noise_type', ['ornstein-uhlenbeck', 'normal', None])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)

    net_arch = trial.suggest_categorical('net_arch', ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])
    net_arch = {
        'small': [64, 64],
        'medium': [256, 256],
        'big': [400, 300],
    }[net_arch]

    hyperparams = {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'train_freq': train_freq,
        'gradient_steps': gradient_steps,
        'n_episodes_rollout': n_episodes_rollout,
        'policy_kwargs': dict(net_arch=net_arch),
    }

    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
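# Hedged usage sketch (not part of the original sampler): `trial.n_actions` is not a standard
# Optuna attribute, so the tuning script presumably attaches it before sampling. The helper
# `make_env` is an assumption for illustration, and the returned dict targets the older
# stable-baselines3 API in which TD3 still accepted `n_episodes_rollout`.
def objective(trial):
    env = make_env()  # assumed helper returning a continuous-action gym env
    trial.n_actions = env.action_space.shape[0]  # consumed by the sampler for noise shapes
    hyperparams = sample_td3_params(trial)
    model = TD3('MlpPolicy', env, **hyperparams)
    model.learn(total_timesteps=10000)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    return mean_reward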
def sample_td3_params(trial: optuna.Trial,
                      octree_observations: bool = True,
                      octree_depth: int = 4,
                      octree_full_depth: int = 2,
                      octree_channels_in: int = 7,
                      octree_fast_conv: bool = True,
                      octree_batch_norm: bool = True) -> Dict[str, Any]:
    """
    Sampler for TD3 hyperparameters
    """
    buffer_size = 150000
    # learning_starts = trial.suggest_categorical(
    #     "learning_starts", [5000, 10000, 20000])
    learning_starts = 5000
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    learning_rate = trial.suggest_float(
        "learning_rate", low=0.000001, high=0.001, log=True)
    gamma = trial.suggest_float("gamma", low=0.98, high=1.0, log=True)
    tau = trial.suggest_float("tau", low=0.001, high=0.025, log=True)
    target_policy_noise = trial.suggest_float(
        "target_policy_noise", low=0.0, high=0.5, log=True)
    target_noise_clip = 0.5
    noise_std = trial.suggest_float("noise_std", low=0.025, high=0.5, log=True)
    action_noise = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                     sigma=np.ones(trial.n_actions) * noise_std)
    train_freq = 1
    gradient_steps = trial.suggest_categorical("gradient_steps", [1, 2])

    policy_kwargs = dict()
    net_arch = trial.suggest_categorical(
        "net_arch", ["small [256, 128]", "medium [384, 256]", "big [512, 384]"])
    policy_kwargs["net_arch"] = {"small [256, 128]": [256, 128],
                                 "medium [384, 256]": [384, 256],
                                 "big [512, 384]": [512, 384]}[net_arch]

    if octree_observations:
        features_extractor_kwargs = dict()
        features_extractor_kwargs["depth"] = octree_depth
        features_extractor_kwargs["full_depth"] = octree_full_depth
        features_extractor_kwargs["channels_in"] = octree_channels_in
        features_extractor_kwargs["channel_multiplier"] = \
            trial.suggest_categorical("channel_multiplier", [8, 16, 32, 64])
        features_extractor_kwargs["features_dim"] = \
            trial.suggest_categorical("features_dim", [256, 512, 768])
        features_extractor_kwargs["fast_conv"] = octree_fast_conv
        features_extractor_kwargs["batch_normalization"] = octree_batch_norm
        policy_kwargs["features_extractor_kwargs"] = features_extractor_kwargs

    return {
        "buffer_size": buffer_size,
        "learning_starts": learning_starts,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "gamma": gamma,
        "tau": tau,
        "target_policy_noise": target_policy_noise,
        "target_noise_clip": target_noise_clip,
        "action_noise": action_noise,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "policy_kwargs": policy_kwargs,
    }
def sample_tqc_params(trial: optuna.Trial,
                      octree_observations: bool = True,
                      octree_depth: int = 4,
                      octree_full_depth: int = 2,
                      octree_channels_in: int = 7) -> Dict[str, Any]:
    """
    Sampler for TQC hyperparameters
    """
    buffer_size = 25000
    learning_starts = 0
    batch_size = 32
    learning_rate = trial.suggest_float(
        "learning_rate", low=0.000025, high=0.00075, log=True)
    gamma = 1.0 - trial.suggest_float("gamma", low=0.0001, high=0.025, log=True)
    tau = trial.suggest_float("tau", low=0.0005, high=0.025, log=True)
    ent_coef = "auto_0.1_0.05"
    target_entropy = "auto"
    noise_std = trial.suggest_float("noise_std", low=0.01, high=0.1, log=True)
    action_noise = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                     sigma=np.ones(trial.n_actions) * noise_std)
    train_freq = 1
    gradient_steps = trial.suggest_categorical("gradient_steps", [1, 2])

    policy_kwargs = dict()
    net_arch = trial.suggest_categorical("net_arch", [128, 256, 384, 512])
    policy_kwargs["net_arch"] = [net_arch] * 2
    policy_kwargs["n_quantiles"] = trial.suggest_int("n_quantiles", low=20, high=40)
    top_quantiles_to_drop_per_net = round(0.08 * policy_kwargs["n_quantiles"])
    policy_kwargs["n_critics"] = trial.suggest_categorical("n_critics", [2, 3])

    if octree_observations:
        features_extractor_kwargs = dict()
        features_extractor_kwargs["depth"] = octree_depth
        features_extractor_kwargs["full_depth"] = octree_full_depth
        features_extractor_kwargs["channels_in"] = octree_channels_in
        features_extractor_kwargs["channel_multiplier"] = \
            trial.suggest_categorical("channel_multiplier", [8, 16, 32])
        features_extractor_kwargs["full_depth_channels"] = \
            trial.suggest_categorical("full_depth_channels", [4, 8, 16])
        features_extractor_kwargs["features_dim"] = \
            trial.suggest_categorical("features_dim", [64, 128, 256])
        features_extractor_kwargs["batch_normalization"] = \
            trial.suggest_categorical("batch_normalization", [True, False])
        policy_kwargs["features_extractor_kwargs"] = features_extractor_kwargs

    return {
        "buffer_size": buffer_size,
        "learning_starts": learning_starts,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "gamma": gamma,
        "tau": tau,
        "ent_coef": ent_coef,
        "target_entropy": target_entropy,
        "top_quantiles_to_drop_per_net": top_quantiles_to_drop_per_net,
        "action_noise": action_noise,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "policy_kwargs": policy_kwargs,
    }
def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for DDPG hyperparams.

    :param trial:
    :return:
    """
    gamma = trial.suggest_categorical(
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    batch_size = trial.suggest_categorical(
        "batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
    buffer_size = trial.suggest_categorical(
        "buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])

    train_freq = trial.suggest_categorical(
        "train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512])
    gradient_steps = train_freq

    noise_type = trial.suggest_categorical(
        "noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    # NOTE: Add "verybig" to net_arch when tuning HER (see TD3)
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])
    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
    }[net_arch]

    hyperparams = {
        "gamma": gamma,
        "tau": tau,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "policy_kwargs": dict(net_arch=net_arch),
    }

    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == "ornstein-uhlenbeck":
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))

    if trial.using_her_replay_buffer:
        hyperparams = sample_her_params(trial, hyperparams)

    return hyperparams
def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for DDPG hyperparams.

    :param trial:
    :return:
    """
    gamma = trial.suggest_categorical(
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size",
                                           [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical(
        "buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02])
    episodic = trial.suggest_categorical("episodic", [True, False])

    if episodic:
        n_episodes_rollout = 1
        train_freq, gradient_steps = -1, -1
    else:
        train_freq = trial.suggest_categorical("train_freq",
                                               [1, 16, 128, 256, 1000, 2000])
        gradient_steps = train_freq
        n_episodes_rollout = -1

    noise_type = trial.suggest_categorical(
        "noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])
    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
    }[net_arch]

    hyperparams = {
        "gamma": gamma,
        "tau": tau,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "n_episodes_rollout": n_episodes_rollout,
        "policy_kwargs": dict(net_arch=net_arch),
    }

    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == "ornstein-uhlenbeck":
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
        # print(self.model)
        return True


if __name__ == '__main__':
    # Instantiate Environment
    env_id = 'gym_spm:spm-v0'
    env = gym.make('gym_spm:spm-v0')

    # HyperParameters
    lr = 3e-4

    # Instantiate Model
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=-30 * np.zeros(n_actions),
                                     sigma=.75 * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1)
    # model = PPO('MlpPolicy', env, tensorboard_log=log_dir)

    # Train OR Load Model
    model.learn(total_timesteps=25000)
    # model.save(model_dir_description)

    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print("Mean Reward = ", mean_reward)

    epsi_sp_list = []
import numpy as np
import pytest

from stable_baselines3 import A2C, PPO, SAC, TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))


@pytest.mark.parametrize('action_noise',
                         [normal_action_noise,
                          OrnsteinUhlenbeckActionNoise(np.zeros(1), 0.1 * np.ones(1))])
def test_td3(action_noise):
    model = TD3('MlpPolicy',
                'Pendulum-v0',
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=100,
                verbose=1,
                create_eval_env=True,
                action_noise=action_noise)
    model.learn(total_timesteps=1000, eval_freq=500)


@pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0'])
def test_a2c(env_id):
    model = A2C('MlpPolicy',
                env_id,
                seed=0,
                policy_kwargs=dict(net_arch=[16]),
                verbose=1,
                create_eval_env=True)
    model.learn(total_timesteps=1000, eval_freq=500)


@pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0'])
@pytest.mark.parametrize("clip_range_vf", [None, 0.2, -0.2])
def test_ppo(env_id, clip_range_vf):
    if clip_range_vf is not None and clip_range_vf < 0:
        # Should throw an error
        with pytest.raises(AssertionError):
            model = PPO('MlpPolicy',
                        env_id,
                        seed=0,
                        policy_kwargs=dict(net_arch=[16]),
                        verbose=1,
                        create_eval_env=True,
                        clip_range_vf=clip_range_vf)
    else:
    test_model(env, model, test_name)

for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_2_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.3) * np.ones(n_actions))
    policy_kwargs = dict(net_arch=[400, 300])
    model = DDPG('MlpPolicy', env, action_noise=action_noise, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_3_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=float(0.3) * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

# PPO algorithm
for i in range(n_tests):
    test_name = 'saved_models/ppo_soccer_actions_env_1_' + str(i)
    n_actions = env.action_space.shape[-1]
    model = PPO('MlpPolicy', env)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

# SAC algorithm
def train(experiment_name: str = typer.Option(...),
          total_timesteps: int = int(5e5),
          env_seed: int = random.randint(0, int(1e6)),
          port: int = 6005,
          device: str = 'cuda',
          gamma: float = 0.98,
          learning_rate: float = 7.3e-4,
          layers_comma_sep: str = '400,300',
          eval_freq: int = 100000,
          n_eval_episodes: int = 5,
          rl_algorithm: RLAlgorithm = RLAlgorithm.sac,
          batch_size: int = 256,
          buffer_size: int = 300000,
          gradient_steps: int = 64,
          learning_starts: int = 10000,
          sac_tau: float = 0.02,
          sac_train_freq: int = 64,
          td3_noise_type: Optional[str] = None,
          td3_noise_std: Optional[float] = None):
    """Train two agents in the tennis environment.

    Training uses single-agent algorithms to train both agents on the union of their
    observations.

    Args:
        experiment_name: the name of the experiment, used to create a directory under
            'experiments' where all training artifacts are stored along with the final and
            best models
        total_timesteps: the number of timesteps to run before stopping training
        env_seed: a seed for the environment's random initialization - if not set, defaults
            to random
        port: the port used by the unity environment to communicate with the C# backend.
            Use different ports for environments which run in parallel.
        device: the device used to train the model, can be 'cpu' or 'cuda:x'
        gamma: the discount rate applied to future rewards
        learning_rate: the learning rate used by the policy and value network optimizer
        layers_comma_sep: layer widths for the networks as a comma-separated list
        eval_freq: the number of steps between validation rounds. Whenever there is an
            improvement, the best model is saved under the 'eval' directory of the experiment.
        n_eval_episodes: number of episodes run during evaluation
        rl_algorithm: the algorithm used to train an agent
        batch_size: the batch size used during training
    """
    experiment_path = EXPERIMENTS_DIR / experiment_name
    model_path = experiment_path / 'model'
    eval_path = experiment_path / 'eval'
    tensorboard_log_path = experiment_path / 'tensorboard_logs'
    for path in [experiment_path, eval_path, tensorboard_log_path]:
        path.mkdir(exist_ok=True, parents=True)

    environment_parameters = dict(seed=env_seed,
                                  no_graphics=True,
                                  train_mode=True,
                                  port=port)
    env = UnityEnvironmentWrapperToGym(**environment_parameters)

    algorithm_class, policy = algorithm_and_policy[rl_algorithm]

    layers = [int(layer_width) for layer_width in layers_comma_sep.split(',')]
    policy_kwargs = remove_none_entries(dict(activation_fn=nn.ReLU, net_arch=layers))

    if rl_algorithm == RLAlgorithm.sac:
        algorithm_specific_parameters = dict(buffer_size=buffer_size,
                                             tau=sac_tau,
                                             train_freq=sac_train_freq,
                                             gradient_steps=gradient_steps,
                                             learning_starts=learning_starts)
    elif rl_algorithm == RLAlgorithm.td3:
        action_shape = (env.num_envs, env.action_space.shape[0])
        action_noise = (NormalActionNoise(
            np.zeros(action_shape, dtype=np.float32),
            td3_noise_std * np.ones(action_shape, dtype=np.float32))
                        if td3_noise_type == 'normal' else None)
        algorithm_specific_parameters = remove_none_entries(
            dict(buffer_size=buffer_size,
                 gradient_steps=gradient_steps,
                 learning_starts=learning_starts,
                 action_noise=action_noise))
    else:
        raise ValueError(f'Unknown algorithm: {rl_algorithm}')

    model = algorithm_class(policy,
                            env,
                            verbose=1,
                            tensorboard_log=str(tensorboard_log_path),
                            device=device,
                            gamma=gamma,
                            policy_kwargs=policy_kwargs,
                            learning_rate=learning_rate,
                            batch_size=batch_size,
                            **remove_none_entries(algorithm_specific_parameters))

    model.learn(total_timesteps=total_timesteps,
                eval_env=env,
                eval_freq=eval_freq,
                n_eval_episodes=n_eval_episodes,
                eval_log_path=str(eval_path))
    model.save(str(model_path))
] and hyperparams.get('noise_type') is not None:
    noise_type = hyperparams['noise_type'].strip()
    noise_std = hyperparams['noise_std']
    n_actions = env.action_space.shape[0]
    if 'normal' in noise_type:
        if 'lin' in noise_type:
            final_sigma = hyperparams.get('noise_std_final', 0.0) * np.ones(n_actions)
            hyperparams['action_noise'] = LinearNormalActionNoise(
                mean=np.zeros(n_actions),
                sigma=noise_std * np.ones(n_actions),
                final_sigma=final_sigma,
                max_steps=n_timesteps)
        else:
            hyperparams['action_noise'] = NormalActionNoise(
                mean=np.zeros(n_actions),
                sigma=noise_std * np.ones(n_actions))
    elif 'ornstein-uhlenbeck' in noise_type:
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=noise_std * np.ones(n_actions))
    else:
        raise RuntimeError(f'Unknown noise type "{noise_type}"')

    print(f"Applying {noise_type} noise with std {noise_std}")

    del hyperparams['noise_type']
    del hyperparams['noise_std']
    if 'noise_std_final' in hyperparams:
        del hyperparams['noise_std_final']

if args.trained_agent.endswith('.zip') and os.path.isfile(args.trained_agent):
    # Continue training
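# Hedged illustration (not from the original script): the branch above expects the loaded
# hyperparameter dict to carry plain 'noise_type' / 'noise_std' entries, e.g. as they would
# appear in an rl-zoo style config, before they are converted into an ActionNoise object and
# deleted from the dict. All key names and values below are placeholders, not tuned settings.
example_hyperparams = {
    'policy': 'MlpPolicy',
    'noise_type': 'normal',      # or 'ornstein-uhlenbeck', or a 'lin'-prefixed annealed variant
    'noise_std': 0.1,            # standard deviation of the exploration noise
    # 'noise_std_final': 0.05,   # only read by the linearly annealed normal noise
}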
def test_sac_phase():
    reward = []
    for i in [2000, 4000, 6000, 8000, 10000]:
        model = SAC("MlpPolicy",
                    "Pendulum-v0",
                    policy_kwargs=dict(net_arch=[64, 64]),
                    learning_starts=5000,
                    verbose=0,
                    create_eval_env=True,
                    buffer_size=i,
                    ent_coef=0,
                    action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                    batch_size=32)
        env = model.env
        eval_callback = EvalCallback(env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/alpha5_phase',
                                     eval_freq=250,
                                     n_eval_episodes=100,
                                     deterministic=True,
                                     render=False)
        model.learn(total_timesteps=20000, callback=eval_callback)
        reward.append(eval_callback.last_mean_reward)

        definition = 200
        portrait = np.zeros((definition, definition))
        state_min = env.observation_space.low
        state_max = env.observation_space.high
        for index_t, t in enumerate(np.linspace(-np.pi, np.pi, num=definition)):
            for index_td, td in enumerate(
                    np.linspace(state_min[2], state_max[2], num=definition)):
                state = torch.Tensor([[np.cos(t), np.sin(t), td]])
                action = model.policy.forward(state)
                portrait[definition - (1 + index_td),
                         index_t] = model.critic.q1_forward(state, action)
        plt.figure(figsize=(10, 10))
        plt.imshow(portrait,
                   cmap="inferno",
                   extent=[-180, 180, state_min[2], state_max[2]],
                   aspect='auto')
        plt.rc('axes', titlesize=12)
        plt.xlabel('angle')
        plt.ylabel('velocity')
        plt.title("critic, last mean reward = {:.2f} +/- {:.2f}, replay size = {}".format(
            reward[-1], eval_callback.last_std, i))
        plt.colorbar(label="critic value")
        plt.scatter([0], [0])
        plt.show()

        definition = 200
        portrait = np.zeros((definition, definition))
        state_min = env.observation_space.low
        state_max = env.observation_space.high
        for index_t, t in enumerate(np.linspace(-np.pi, np.pi, num=definition)):
            for index_td, td in enumerate(
                    np.linspace(state_min[2], state_max[2], num=definition)):
                state = torch.Tensor([[np.cos(t), np.sin(t), td]])
                probs = model.policy.forward(state)
                action = probs.data.numpy().astype(float)
                portrait[definition - (1 + index_td), index_t] = action
        plt.figure(figsize=(10, 10))
        plt.imshow(portrait,
                   cmap="coolwarm",
                   extent=[-180, 180, state_min[2], state_max[2]],
                   aspect='auto')
        plt.title("action, last mean reward = {:.2f} +/- {:.2f}, replay size = {}".format(
            reward[-1], eval_callback.last_std, i))
        plt.colorbar(label="action")
        plt.rc('axes', titlesize=12)
        plt.xlabel('angle')
        plt.ylabel('velocity')
        plt.scatter([0], [0])
        plt.show()
    return reward
from torchvision import transforms
import numpy as np
from model import BehaviorCloneNet, CarModel
from logloader import LogLoader
import time
from torchvision.transforms import Compose, ToTensor, Normalize
from custom_arch import CustomCNN, CustomActorCriticPolicy

env = make_vec_env(DeepwatchEnv2)
policy_kwargs = dict(features_extractor_class=CustomCNN)

# check_env(env)

n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

# model = TD3(CnnPolicy, env, action_noise=action_noise, buffer_size=50000, verbose=1)  # optimize_memory_usage=True
# model = SAC(CnnPolicy, env, buffer_size=50000, action_noise=action_noise, learning_rate=0.0005, tensorboard_log='./tensorboard', verbose=1)
# model = SAC.load("deepwatch_evolution_sac_7", env)
model = A2C(MlpPolicy, env, verbose=1, n_steps=5)  # , policy_kwargs=policy_kwargs)
# load() is a classmethod returning a new model, so reassign it to continue from the checkpoint
model = A2C.load("deepwatch_evolution_a2c_2", env)

for i in range(100):
    model.learn(total_timesteps=1000)
    model.save("deepwatch_evolution_a2c_3")
    print("Saved Checkpoint")

# model.learn(total_timesteps=10000)
# model.save("deepwatch_evolution")
    'noise_std': 0.513787888663763,
    'net_arch': 'medium'
}

policy_kwargs = dict(net_arch=[256, 256])  # medium

if hyper['episodic']:
    hyper['n_episodes_rollout'] = 1
    hyper['train_freq'], hyper['gradient_steps'] = -1, -1
else:
    hyper['gradient_steps'] = hyper['train_freq']
    hyper['n_episodes_rollout'] = -1

n_actions = env.action_space.shape[0]
if hyper["noise_type"] == "normal":
    hyper["action_noise"] = NormalActionNoise(
        mean=np.zeros(n_actions),
        sigma=hyper['noise_std'] * np.ones(n_actions))
elif hyper["noise_type"] == "ornstein-uhlenbeck":
    hyper["action_noise"] = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=hyper['noise_std'] * np.ones(n_actions))

model = DDPG('MlpPolicy',
             env,
             verbose=0,
             tensorboard_log=tensorboard_log,
             seed=seed,
             gamma=hyper['gamma'],
             learning_rate=hyper['lr'],
             batch_size=hyper['batch_size'],
             buffer_size=hyper['buffer_size'],
def test_sac(ent_coef):
    model = SAC('MlpPolicy',
                'Pendulum-v0',
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=100,
                verbose=1,
                create_eval_env=True,
                ent_coef=ent_coef,
                action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)))
    model.learn(total_timesteps=1000, eval_freq=500)
if is_ipython:
    display.clear_output(wait=True)
    display.display(plt.gcf())

env = PortfolioHedgingEnv(use_skew=False,
                          hedger_verbose=False,
                          corr=0.0,
                          instr_weight=0.5,
                          save_figs=True)
env.model_name = "sac_autohedger_portfolio_common_c_0_w_05"
policy_args = {"net_arch": [8000, 8000]}
reward_history = []

noise = NormalActionNoise(0, 50)
model = SAC(MlpPolicy,
            env,
            verbose=2,
            learning_rate=5e-6,
            target_update_interval=32,
            learning_starts=0,
            use_sde_at_warmup=True,
            use_sde=False,
            policy_kwargs=policy_args,
            buffer_size=int(10e6))

model.learn(total_timesteps=20000,
            log_interval=50,
            n_eval_episodes=100,
            callback=callback)
# model.save(env.model_name)