def parse_noise_types(noise_type, nb_actions):
    """ Parse noise types for policies """
    action_noise = None
    param_noise = None
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))
    return action_noise, param_noise
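# Minimal usage sketch (assumes the stable-baselines noise classes used above
# are imported): the spec string can mix several comma-separated entries, and
# a later entry of the same kind overwrites an earlier one.
action_noise, param_noise = parse_noise_types('adaptive-param_0.2,ou_0.1', nb_actions=4)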
def create_action_noise(env, noise_type):
    action_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            # AdaptiveParamNoiseSpec is parameter-space noise, not an action
            # noise; returning it as action_noise would break the caller, so
            # it is ignored here (use parse_noise_types above if you need it).
            pass
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))
    return action_noise
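# Usage sketch: feed the parsed noise straight into an off-policy model
# ('env' and the DDPG import are assumed from the surrounding project).
model = DDPG('MlpPolicy', env, action_noise=create_action_noise(env, 'normal_0.1'))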
def run(self):
    self._init()
    env = self.env
    model = self.model
    objective = self.objective
    if objective == "infogain":
        wenv = InfogainEnv(env, model)
    elif objective == "prederr":
        wenv = PrederrEnv(env, model)
    else:
        raise AttributeError(
            "Objective '{}' is unknown. Needs to be 'infogain' or 'prederr'".format(objective))
    wenv.max_episode_len = self.horizon
    wenv.end_episode_callback = self._end_episode
    dvenv = DummyVecEnv([lambda: wenv])

    if self.rl_algo == "ddpg":
        self.logger.info("Setting up DDPG as model-free RL algorithm.")
        pn = AdaptiveParamNoiseSpec()
        an = NormalActionNoise(np.array([0]), np.array([1]))
        rl_model = DDPG(DDPGMlpPolicy, dvenv, verbose=1, render=False,
                        action_noise=an, param_noise=pn,
                        nb_rollout_steps=self.horizon,
                        nb_train_steps=self.horizon)
    elif self.rl_algo == "sac":
        self.logger.info("Setting up SAC as model-free RL algorithm.")
        rl_model = SAC(SACMlpPolicy, dvenv, verbose=1, learning_starts=self.horizon)
    else:
        raise AttributeError(
            "Model-free RL algorithm '{}' is unknown.".format(self.rl_algo))

    # Train the agent
    max_steps_total = self.horizon * self.n_episodes * 100
    try:
        self.logger.info("Start the agent")
        rl_model.learn(total_timesteps=max_steps_total, seed=self.seed)
    except MaxEpisodesReachedException:
        print("Exploration finished.")
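# The except clause above assumes a custom exception raised from the
# end-of-episode callback once enough episodes have been collected;
# a minimal sketch of such an exception:
class MaxEpisodesReachedException(Exception):
    """Raised by the end-of-episode callback to stop exploration early."""
    pass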
def main():
    # Save argument values to yaml file
    args_file_path = os.path.join(args.log_dir, 'args.yaml')
    with open(args_file_path, 'w') as f:
        yaml.dump(vars(args), f, default_flow_style=False)

    # Create and wrap the environment
    env = gym.make(args.env)
    env = Monitor(env, args.log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Add some param noise for exploration
    if args.model == 'DDPG':
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                             desired_action_stddev=0.2)
        model = MODEL_CLASS(MlpPolicy, env, param_noise=param_noise,
                            memory_limit=int(1e6), verbose=0)
    elif args.model == 'SAC':
        # TODO: This doesn't work
        model = MODEL_CLASS(MlpPolicy, env, verbose=1,
                            policy_kwargs={'n_env': 1, 'n_steps': 64, 'n_batch': 64})
    else:
        model = MODEL_CLASS(MlpPolicy, env, verbose=0)

    # Train the agent
    model.learn(total_timesteps=args.n_steps, callback=callback)

    # Save the final model
    if args.save_model:
        model_file_path = os.path.join(args.log_dir, 'model.pkl')
        model.save(model_file_path)
        print("Best and final models saved in ", os.path.abspath(args.log_dir))

    if args.plots:
        raise NotImplementedError
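# `callback` is not defined in this snippet; stable-baselines 2.x accepts a
# plain function of (locals, globals) that returns False to stop training.
# A minimal sketch that simply keeps training going:
def callback(_locals, _globals):
    return True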
def main(_algo_name, _algo_tag, _tag_suffix, _save_freq, _lock_rotation,
         _eval_num, _eval_freq, hyperparams):
    rotation_tag = "_LOCKED_ROT_" if _lock_rotation else "_ROTATION_"
    full_tag = _algo_name + rotation_tag + _algo_tag + _tag_suffix
    current_dir = _algo_name + "/" + full_tag
    log_dir = current_dir + "/log/"
    eval_log_dir = current_dir + "/log/eval/"
    trained_models_dir = current_dir + "/models/"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(eval_log_dir, exist_ok=True)
    os.makedirs(trained_models_dir, exist_ok=True)

    is_discrete = _algo_name == 'DQN'
    panda_env = HERGoalEnvWrapper(CustomMonitor(
        get_environment(_lock_rotation=_lock_rotation, _is_discrete=is_discrete), log_dir))
    eval_env = HERGoalEnvWrapper(CustomMonitor(
        get_environment(_lock_rotation=_lock_rotation, _is_discrete=is_discrete), eval_log_dir))

    callbacks = []
    if _save_freq > 0:
        callbacks.append(CheckpointCallback(_save_freq, trained_models_dir))
    callbacks.append(MeanHundredEpsTensorboardCallback(log_dir))
    callbacks.append(StdHundredEpsTensorboardCallback(log_dir))
    callbacks.append(SuccessRateTensorboardCallback(log_dir))
    if _algo_name == 'DDPG':
        callbacks.append(SaveOnBestTrainingRewardCallback(10000, log_dir))
    elif _eval_freq > 0:
        callbacks.append(EvalCallback(eval_env,
                                      best_model_save_path=trained_models_dir,
                                      log_path=log_dir, eval_freq=_eval_freq,
                                      deterministic=True, render=False,
                                      n_eval_episodes=_eval_num))

    time_steps = hyperparams.pop('n_timesteps', None)

    param_noise = None
    action_noise = None
    if hyperparams.get('noise_type') is not None:
        noise_type = hyperparams.pop('noise_type').strip()
        if 'ornstein-uhlenbeck' in noise_type:
            n_actions = panda_env.action_space.shape[-1]
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=0.005 * np.ones(n_actions))
        elif 'param_noise' in noise_type:
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                                 desired_action_stddev=0.1)

    # Add action noise for DDPG or TD3; for DQN the noise flag is already in hyperparams.
    if _algo_name in ('DDPG', 'TD3'):
        hyperparams['action_noise'] = action_noise
    # Add hyperparams specific to DDPG only.
    if _algo_name == 'DDPG':
        hyperparams['param_noise'] = param_noise
        hyperparams['eval_env'] = eval_env

    model = ALGOS[_algo_name](env=panda_env, tensorboard_log="tensorboard/",
                              n_cpu_tf_sess=None, **hyperparams)
    model.learn(total_timesteps=time_steps, callback=callbacks,
                tb_log_name=full_tag, log_interval=10)
    model.save(current_dir + "/" + full_tag + "_final")
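# Sketch of the hyperparams dict this function consumes (keys taken from the
# pops/gets above; the concrete values are illustrative only):
hyperparams = {
    'n_timesteps': int(1e6),
    'noise_type': 'ornstein-uhlenbeck',
    'gamma': 0.99,  # any remaining keys are forwarded to ALGOS[_algo_name]
}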
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    Run the training of DDPG.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'),
        can use multiple noise types by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()
    model = DDPG(policy=MlpPolicy, env=env, memory_policy=Memory, eval_env=eval_env,
                 param_noise=param_noise, action_noise=action_noise,
                 memory_limit=int(1e6), layer_norm=layer_norm, verbose=2, **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
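# Usage sketch (argument values are illustrative; this script is normally
# driven by an argparse front end):
run('HalfCheetah-v2', seed=0, noise_type='adaptive-param_0.2,ou_0.2',
    layer_norm=True, evaluation=False)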
import numpy as np

from stable_baselines import DDPG
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.policies import LnMlpPolicy
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise

# ActiveEnv is the project's custom power-grid environment (its import is not
# shown in the original snippet).
powerenv = ActiveEnv()
powerenv.set_parameters({
    'state_space': ['sun', 'demand', 'imbalance'],
    'reward_terms': ['voltage', 'current', 'imbalance']
})
powerenv = DummyVecEnv([lambda: powerenv])

action_mean = np.zeros(powerenv.action_space.shape)
action_sigma = 0.3 * np.ones(powerenv.action_space.shape)
action_noise = OrnsteinUhlenbeckActionNoise(mean=action_mean, sigma=action_sigma)
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.01)

t_steps = 800000
logdir = 'C:\\Users\\vegar\\Dropbox\\Master\\logs'
powermodel = DDPG(
    LnMlpPolicy,
    powerenv,
    verbose=2,
    action_noise=action_noise,
    gamma=0.99,
    # param_noise=param_noise,
    tensorboard_log=logdir,
    memory_limit=int(800000),
    nb_train_steps=50,
    nb_rollout_steps=100,
    critic_lr=0.001,
)
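# The original snippet breaks off inside the constructor call; a plausible
# continuation using the standard stable-baselines learn/save calls (the
# filename is illustrative):
powermodel.learn(total_timesteps=t_steps)
powermodel.save('ddpg_powergrid')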
def train(self, args, callback, env_kwargs=None, train_kwargs=None):
    env = self.makeEnv(args, env_kwargs=env_kwargs)

    if train_kwargs is None:
        train_kwargs = {}

    # Parse noise_type
    action_noise = None
    param_noise = None
    n_actions = env.action_space.shape[-1]
    if args.noise_param:
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=args.noise_param_sigma,
                                             desired_action_stddev=args.noise_param_sigma)

    if train_kwargs.get("noise_action", args.noise_action) == 'normal':
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=args.noise_action_sigma * np.ones(n_actions))
    elif train_kwargs.get("noise_action", args.noise_action) == 'ou':
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=args.noise_action_sigma * np.ones(n_actions))

    # Filter out the noise hyperparams consumed above; the remaining
    # train_kwargs override the defaults below.
    train_kwargs = {k: v for k, v in train_kwargs.items()
                    if k not in ["noise_action_sigma", "noise_action"]}

    # Pick the policy matching the requested architecture.
    if args.srl_model == "raw_pixels":
        args.policy = "cnn"
    else:
        args.policy = "mlp"
    self.policy = args.policy
    self.ob_space = env.observation_space
    self.ac_space = env.action_space

    policy_fn = {'cnn': CnnPolicy, 'mlp': MlpPolicy}[args.policy]

    param_kwargs = {
        "verbose": 1,
        "render_eval": False,
        "render": False,
        "reward_scale": 1.,
        "param_noise": param_noise,
        "normalize_returns": False,
        "normalize_observations": (args.srl_model == "raw_pixels"),
        "critic_l2_reg": 1e-2,
        "actor_lr": 1e-4,
        "critic_lr": 1e-3,
        "action_noise": action_noise,
        "enable_popart": False,
        "gamma": 0.99,
        "clip_norm": None,
        "nb_train_steps": 100,
        "nb_rollout_steps": 100,
        "nb_eval_steps": 50,
        "batch_size": args.batch_size
    }

    self.model = self.model_class(policy_fn, env, **{**param_kwargs, **train_kwargs})
    self.model.learn(total_timesteps=args.num_timesteps, seed=args.seed, callback=callback)
    env.close()
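# Sketch of the args namespace this method reads (fields gathered from the
# attribute accesses above; values are illustrative):
import argparse

args = argparse.Namespace(noise_param=False, noise_param_sigma=0.2,
                          noise_action='ou', noise_action_sigma=0.2,
                          srl_model='raw_pixels', batch_size=64,
                          num_timesteps=int(1e6), seed=0)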
import gym
import numpy as np
from matplotlib import pyplot as plt

from stable_baselines import TD3, DDPG
from stable_baselines.ddpg.policies import MlpPolicy, LnMlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec

env = gym.make('gym_squeeze:squeeze-v0')

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1,
                                     adoption_coefficient=1.01)
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.1 * np.ones(n_actions))

model = DDPG(LnMlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise,
             tensorboard_log="./ppo1_squeeze_tensorboard_1/", full_tensorboard_log=True)
model.learn(total_timesteps=10000000)
model.save("ddpg_squeeze")

del model  # remove to demonstrate saving and loading
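# Loading the saved agent follows the standard stable-baselines pattern:
model = DDPG.load("ddpg_squeeze")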
def train_HER(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = kwargs['policy']
    algo_name = kwargs['algo_name']
    n_timesteps = kwargs['n_timesteps']
    noise_type = None
    if 'noise_type' in kwargs:
        noise_type = kwargs['noise_type']
        del kwargs['noise_type']

    # HER available strategies (cf. paper): future, final, episode, random
    goal_selection_strategy = kwargs['goal_selection_strategy']
    n_sampled_goal = kwargs['n_sampled_goal']

    del kwargs['policy']
    del kwargs['algo_name']
    del kwargs['n_timesteps']
    del kwargs['goal_selection_strategy']
    del kwargs['n_sampled_goal']

    # Set agent algorithm
    agent = set_agent(algo_name)
    if not agent:
        print("invalid algorithm for HER")
        return

    # the noise objects
    nb_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = None
    if noise_type:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if 'adaptive-param' in current_noise_type and algo_name == 'ddpg':
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                     desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Create learning rate schedule
    for key in ['learning_rate', 'learning_rate_pi', 'cliprange']:
        if key in kwargs:
            if isinstance(kwargs[key], str):
                schedule, initial_value = kwargs[key].split('_')
                initial_value = float(initial_value)
                kwargs[key] = linear_schedule(initial_value)
            elif isinstance(kwargs[key], float):
                kwargs[key] = constfn(kwargs[key])
            else:
                raise ValueError('Invalid value for {}: {}'.format(key, kwargs[key]))

    kwargs['tensorboard_log'] = os.path.join(log_dir, 'tb')
    kwargs['full_tensorboard_log'] = False
    kwargs['seed'] = seed
    kwargs['action_noise'] = action_noise
    if algo_name == 'ddpg':
        kwargs['param_noise'] = param_noise

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training
        print("Loading pretrained agent")
        # Policy should not be changed
        for key in ['policy', 'policy_kwargs']:
            if key in kwargs:
                del kwargs[key]
        model = HER.load(os.path.join(out_dir, 'final_model.pkl'), env=env,
                         verbose=1, **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = HER(policy, env, agent,
                    goal_selection_strategy=goal_selection_strategy,
                    n_sampled_goal=n_sampled_goal, verbose=1, **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)
    return model
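# `linear_schedule` and `constfn` are referenced but not defined in this
# snippet; a common sketch, following the rl-baselines-zoo convention of a
# schedule called with the remaining-progress fraction (1 at the start of
# training, 0 at the end):
def linear_schedule(initial_value):
    def schedule(progress):
        return progress * initial_value
    return schedule

def constfn(val):
    def schedule(_):
        return val
    return schedule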
def train(env_id, num_timesteps, seed, model_path=None, images=False):
    """
    Train a DDPG model on the robosuite SawyerLift environment, for testing
    purposes (PPO2/TRPO setups are kept below, commented out).

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        if images:
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_object_obs=False,
                    use_camera_obs=True,  # train from pixel observations
                    has_offscreen_renderer=True,  # needed for camera observations
                    has_renderer=False,  # no on-screen rendering
                    camera_depth=True,
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control should happen fast enough so that simulation looks smooth
                    render_visual_mesh=False,
                ),
                keys=["image", "depth"],
                images=True,
            )
        else:
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_object_obs=True,
                    use_camera_obs=False,  # do not use pixel observations
                    has_offscreen_renderer=False,  # not needed since not using pixel obs
                    has_renderer=False,  # no on-screen rendering
                    camera_depth=False,
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control should happen fast enough so that simulation looks smooth
                    render_visual_mesh=False,
                )
            )
        env_out.reward_range = None
        env_out.metadata = None
        env_out.spec = None
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)
    set_global_seeds(seed)
    policy = CnnPolicy if images else MlpPolicy
    tblog = "/cvgl2/u/surajn/workspace/tb_logs/sawyerlift_all/"

    nb_actions = env.action_space.shape[-1]
    # model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99,
    #              noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, verbose=1,
    #              tensorboard_log=tblog)
    # model = TRPO(policy=policy, env=env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
    #              cg_damping=0.1, entcoeff=0.0, gamma=0.99, lam=0.98, vf_iters=5,
    #              vf_stepsize=1e-3, tensorboard_log=tblog, verbose=1)
    model = DDPG(policy=ddpgMlpPolicy, env=env, memory_policy=Memory, eval_env=None,
                 param_noise=AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                                    desired_action_stddev=0.2),
                 action_noise=OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                           sigma=0.2 * np.ones(nb_actions)),
                 memory_limit=int(1e6), verbose=2, tensorboard_log=tblog)
    model.learn(total_timesteps=num_timesteps)
    env.close()

    if model_path:
        model.save(model_path)
        # tf_util.save_state(model_path)
    return model, env
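# Example invocation (path is illustrative; note env_id is accepted but the
# function builds SawyerLift directly):
model, env = train('SawyerLift', num_timesteps=int(1e6), seed=0,
                   model_path='ddpg_sawyerlift.pkl')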
def train_DDPG(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = kwargs['policy']
    n_timesteps = kwargs['n_timesteps']
    noise_type = kwargs['noise_type']
    del kwargs['policy']
    del kwargs['n_timesteps']
    del kwargs['noise_type']

    '''
    Parameter space noise: injects randomness directly into the parameters of
    the agent, altering the types of decisions it makes such that they always
    fully depend on what the agent currently senses.
    '''
    # the noise objects for DDPG
    nb_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                     desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                                 sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                            sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training
        print("Loading pretrained agent")
        # Policy should not be changed (it was already removed from kwargs above)
        model = DDPG.load(os.path.join(out_dir, 'final_model.pkl'), env=env,
                          tensorboard_log=os.path.join(log_dir, 'tb'),
                          verbose=1, **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = DDPG(policy, env, param_noise=param_noise, action_noise=action_noise,
                     seed=seed, verbose=1, tensorboard_log=os.path.join(log_dir, 'tb'),
                     full_tensorboard_log=False, **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)
    return model
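# Sketch of the kwargs train_DDPG expects (keys taken from the accesses above;
# values are illustrative):
ddpg_kwargs = {
    'policy': 'LnMlpPolicy',
    'n_timesteps': int(1e6),
    'noise_type': 'adaptive-param_0.2,ou_0.2',
    'gamma': 0.99,  # remaining keys are forwarded to DDPG
}
model = train_DDPG(env, out_dir='runs/ddpg', seed=0, **ddpg_kwargs)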
import numpy as np
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise

# environment (OsmoEnv is the project's custom environment; its import is not
# shown in the original snippet)
# env = OsmoEnv()
# env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: OsmoEnv()])

# parameters (for training)
tau = 0.1     # update rate for the target model
gamma = 0.95  # discount rate for the Q value
# batch_size = NUMCONC * 5 + 3  # size of batch
batch_size = 10
alr = 0.003   # actor learning rate
clr = 0.003   # critic learning rate

# noise (for better exploration)
n_actions = env.action_space.shape[-1]
param_noise = AdaptiveParamNoiseSpec()
# action_noise = None
# param_noise = None
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.5 * np.ones(n_actions))  # a Gaussian action noise

# model (DDPG)
# Deep Deterministic Policy Gradient.
# DDPG combines ideas from nature DQN, actor-critic methods and DPG; it is
# designed to tackle continuous action spaces.
# Policy learning: the policy function (actor) takes the state as input and is
# updated according to the policy gradient.
# Q-learning: the value function (critic) takes state and action as input and
# is adjusted to minimize the loss.
# Q-learning for a function approximator is largely based on minimizing this
# MSBE loss, with two main tricks, namely a replay buffer and a target network.
# The replay buffer is used to store experience, because DDPG is an off-policy
# algorithm.
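# The snippet defines hyperparameters but stops before building the model; a
# minimal sketch wiring them into stable-baselines' DDPG (the policy choice
# and the timestep budget are assumptions):
from stable_baselines import DDPG
from stable_baselines.ddpg.policies import MlpPolicy

model = DDPG(MlpPolicy, env, gamma=gamma, tau=tau, batch_size=batch_size,
             actor_lr=alr, critic_lr=clr,
             param_noise=param_noise, action_noise=action_noise, verbose=1)
model.learn(total_timesteps=100000)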