def run_agent(envs, parameters):
    '''Load a trained agent, run it, and yield its weights.'''
    path = Path(parameters['path'])
    dummy_env = OptVecEnv(envs)
    set_global_seeds(parameters.setdefault('seed'))
    save_path = str(path / 'model.pkl')
    alg = parameters['alg']
    if alg == 'PPO':
        with open(save_path, 'rb') as pkl:
            model = PPO2.load(pkl, env=dummy_env)
    elif alg == 'A2C':
        with open(save_path, 'rb') as pkl:
            model = A2C.load(pkl, env=dummy_env)
    try:
        done = False
        observations = dummy_env.reset()
        while not done:
            action = model.predict(observations)
            print(action[0].ravel().tolist())
            observations, rewards, dones, infos = dummy_env.step(action[0])
            done = any(dones)
            info = infos[0]
            yield info['weights']
    finally:
        dummy_env.close()
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo robotics tasks.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The robotics environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    keys = ['observation', 'desired_goal']
    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success',),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
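# Usage sketch for the factory above ('FetchReach-v1' is an illustrative gym
# robotics env ID, not taken from the code; requires the gym robotics extras):
env = make_robotics_env('FetchReach-v1', seed=0, rank=0)
obs = env.reset()  # flattened observation + desired goal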
def run_agent(envs, parameters):
    '''Train an agent.'''
    alg = parameters['alg']
    learning_rate = parameters['learning_rate']
    gamma = parameters['gamma']
    model_path = parameters['model_path']
    set_global_seeds(parameters.get('seed'))
    dummy_env = OptVecEnv(envs)
    if alg == 'PPO':
        model = PPO2(MlpPolicy, dummy_env, gamma=gamma,
                     learning_rate=learning_rate, verbose=1,
                     nminibatches=dummy_env.num_envs)
    elif alg == 'A2C':
        model = A2C(MlpPolicy, dummy_env, gamma=gamma,
                    learning_rate=learning_rate, verbose=1)
    else:
        model = DDPG(ddpg.MlpPolicy, dummy_env, gamma=gamma, verbose=1,
                     actor_lr=learning_rate / 10, critic_lr=learning_rate)
    try:
        model.learn(total_timesteps=parameters.get('total_timesteps', 10**6))
    except tf.errors.InvalidArgumentError:
        LOGGER.error('Possible NaN, %s', str((alg, learning_rate, gamma)))
    finally:
        dummy_env.close()
        model.save(str(model_path))
def make_mario_env(env_id, num_env, seed, actions=None, cut_map=False,
                   do_wrap_dm=True, wrapper_kwargs=None, start_index=0,
                   allow_early_resets=True, start_method=None,
                   use_subprocess=False):  # FIXME: set up `actions` on the env
    """
    Create a wrapped, monitored VecEnv for Mario.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param actions: currently unused (see the FIXME above)
    :param cut_map: (bool) whether to crop the map with CutMarioMap
    :param do_wrap_dm: (bool) whether to apply deepmind-style wrapping (see the FIXME below)
    :param wrapper_kwargs: (dict) the parameters for the wrap_deepmind function
    :param start_index: (int) start rank index
    :param allow_early_resets: (bool) allows early reset of the environment
    :param start_method: (str) method used to start the subprocesses.
        See SubprocVecEnv doc for more information
    :param use_subprocess: (bool) Whether to use `SubprocVecEnv` or `DummyVecEnv` when
        `num_env` > 1; `DummyVecEnv` is usually faster. Default: False
    :return: (VecEnv) The Mario environment
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):
        def _thunk():
            env = make_mario(env_id)
            env.seed(seed + rank)
            if cut_map:
                env = CutMarioMap(env)
            env = Monitor(env,
                          logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets)
            # FIXME: honor `do_wrap_dm`; create other wrapping methods
            return wrap_deepmind_custom(env, **wrapper_kwargs)  # converts to 84x84 grayscale, keep for now
        return _thunk

    set_global_seeds(seed)

    # When using one environment, no need to start subprocesses
    if num_env == 1 or not use_subprocess:
        return DummyVecEnv([make_env(i + start_index) for i in range(num_env)])
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)],
                         start_method=start_method)
def make_env(rank, seed=0, sub_id=6, enable_draw=True):
    def _init():
        env = SimpleHumanoidMimicEnv(sub_id=sub_id, enable_draw=enable_draw)
        # Important: use a different seed for each environment
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init
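# Usage sketch (an assumption about how this factory is consumed; SubprocVecEnv
# is from stable_baselines.common.vec_env, and the worker count is illustrative):
num_cpu = 4
vec_env = SubprocVecEnv([make_env(rank, seed=0, enable_draw=False)
                         for rank in range(num_cpu)])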
def train_ppo(env_id, num_timesteps, seed, policy, save_params,
              n_envs=1, nminibatches=5, n_steps=8000):
    """
    env_id: (str) identifies each environment uniquely
    num_timesteps: number of timesteps to run the algorithm
    seed: initial random seed
    policy: policy to be followed (mlp, cnn, lstm, etc.)
    save_params: path under which the hyperparameters and weights are saved
    n_envs: number of envs to run in parallel
    nminibatches: number of minibatches of mini-batch gradient descent
        (first-order optimization) to update the policy params
    n_steps: number of steps in each update
    """
    # Train the PPO algorithm for num_timesteps.
    # Stack the frames for the vectorized environment.
    # Note: PPO2 works only with a vectorized environment.
    set_global_seeds(seed)
    env = make_atari(env_id)
    env.seed(seed)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = wrap_deepmind(env, frame_stack=True)
    # define the policy
    policy = {'cnn': CnnPolicy,
              'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy,
              'mlp': MlpPolicy}[policy]
    # create model object for class PPO2
    model = PPO2(policy=policy, env=env, n_steps=n_steps,
                 nminibatches=nminibatches, lam=0.95, gamma=0.99,
                 noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1, verbose=1)
    # train the model (`callback` and `log_dir` are assumed to be defined at module level)
    # trained for 2e7 timesteps with seed = 5
    model.learn(total_timesteps=num_timesteps, callback=callback)
    # save the hyperparameters and weights
    model.save(save_params)
    env.close()
    # free the memory
    del model
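# Invocation sketch: the env ID and save path are assumptions; the comments above
# mention training for 2e7 timesteps with seed = 5:
train_ppo('PongNoFrameskip-v4', num_timesteps=int(2e7), seed=5,
          policy='cnn', save_params='ppo_pong_params')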
def make_mujoco_env(env_id, seed, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The mujoco environment
    """
    set_global_seeds(seed + 10000 * mpi_rank_or_zero())
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(logger.get_dir(), '0'),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
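# Usage sketch ('HalfCheetah-v2' is an illustrative MuJoCo env ID, not taken from
# the code above; DummyVecEnv and PPO2 are from stable_baselines):
env = DummyVecEnv([lambda: make_mujoco_env('HalfCheetah-v2', seed=0)])
model = PPO2('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=int(1e6))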
def make_envs(env_id, do_eval, seed, conf,
              normalize_observations=False, normalize_returns=False):
    # Create envs.
    env_params = conf.pop('env_params', {})
    env = base_env = gym.make(env_id)
    if hasattr(base_env, 'env'):
        base_env = base_env.env
    for attr in env_params:
        setattr(base_env, attr, env_params[attr])
    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    if normalize_observations or normalize_returns:
        env = DummyVecEnv([lambda: env])
        env = VecNormalize(env, norm_obs=normalize_observations,
                           norm_reward=normalize_returns)

    if do_eval:
        eval_env = base_eval_env = gym.make(env_id)
        if hasattr(base_eval_env, 'env'):
            base_eval_env = base_eval_env.env
        for attr in env_params:
            setattr(base_eval_env, attr, env_params[attr])
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'),
                                 allow_early_resets=True)
        eval_env.seed(seed)
        eval_env.base_env = base_eval_env
    else:
        base_eval_env = None
        eval_env = None

    env.base_env = base_env
    return base_env, env, base_eval_env, eval_env
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None,
                   start_index=0, allow_early_resets=True, start_method=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param wrapper_kwargs: (dict) the parameters for the wrap_deepmind function
    :param start_index: (int) start rank index
    :param allow_early_resets: (bool) allows early reset of the environment
    :param start_method: (str) method used to start the subprocesses.
        See SubprocVecEnv doc for more information
    :return: (VecEnv) The atari environment
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(env,
                          logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets)
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk

    set_global_seeds(seed)

    # When using one environment, no need to start subprocesses
    if num_env == 1:
        return DummyVecEnv([make_env(0)])
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)],
                         start_method=start_method)
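# Usage sketch following the usual stable-baselines Atari recipe; the env ID,
# number of envs, and frame-stack depth are illustrative choices:
env = make_atari_env('BreakoutNoFrameskip-v4', num_env=4, seed=0)
env = VecFrameStack(env, n_stack=4)  # from stable_baselines.common.vec_env
model = PPO2('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=int(1e6))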
def train_trpo(env_id, num_timesteps, seed, policy):
    # env_id: (str) identifies each environment uniquely
    # num_timesteps: number of timesteps to run the algorithm
    # seed: initial random seed
    # policy: policy to be followed (mlp, cnn, lstm, etc.)

    # set up the environment
    rank = MPI.COMM_WORLD.Get_rank()
    sseed = seed + 10000 * rank
    set_global_seeds(sseed)
    env = make_atari(env_id)
    env = wrap_deepmind(env)
    env.seed(sseed)
    # define policies
    policy = {'cnn': CnnPolicy,
              'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy,
              'mlp': MlpPolicy}[policy]
    # define TRPO class object
    model = TRPO(policy=policy, env=env, timesteps_per_batch=1024, max_kl=0.01,
                 cg_iters=10, cg_damping=1e-3, ent_coef=0.0, gamma=0.99,
                 lam=1, vf_iters=3, vf_stepsize=1e-4, verbose=1)
    # Train TRPO for num_timesteps
    model.learn(total_timesteps=num_timesteps)
    # save the hyperparameters and weights
    model.save('trpo' + env_id)
    env.close()
    # free the memory
    del model
def set_random_seed(self, seed):
    """
    :param seed: (int) Seed for the pseudo-random generators. If None, do not change the seeds.
    """
    # Ignore if the seed is None
    if seed is None:
        return
    # Seed python, numpy and tf random generators
    set_global_seeds(seed)
    if self.env is not None:
        if isinstance(self.env, VecEnv):
            # Use a different seed for each env
            for idx in range(self.env.num_envs):
                self.env.env_method("seed", seed + idx)
        else:
            self.env.seed(seed)
        # Seed the action space
        # useful when selecting random actions
        self.env.action_space.seed(seed)
    self.action_space.seed(seed)
def train_dqn_adv(env_id, train_timesteps, seed, policy, save_params, n_envs=1):
    set_global_seeds(seed)
    env = make_atari(env_id)
    env.seed(seed)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = wrap_deepmind(env, frame_stack=True)
    # define the policy
    policy = {'cnn': CnnPolicy, 'mlp': MlpPolicy}[policy]
    # create model object for class DQN
    model = DQN(policy=policy, env=env, gamma=0.99, learning_rate=0.0001,
                buffer_size=10000, exploration_fraction=0.1,
                exploration_final_eps=0.01, exploration_initial_eps=1.0,
                train_freq=4, batch_size=32, double_q=True,
                learning_starts=10000, target_network_update_freq=1000,
                prioritized_replay=True, prioritized_replay_alpha=0.6,
                prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None,
                prioritized_replay_eps=1e-06, param_noise=False,
                n_cpu_tf_sess=None, verbose=1)
    callback = save_best_model_callback(save_freq=100, log_dir=log_dir,
                                        save_params=save_params, verbose=1)
    # train the model
    # trained for 2e7 timesteps with seed = 7
    model.learn(total_timesteps=train_timesteps, callback=callback)
    plot_results([log_dir], train_timesteps, results_plotter.X_TIMESTEPS,
                 "DQNPong_TrainedByAdversary")
    plt.show()
    env.close()
    # free the memory
    del model
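# Invocation sketch: the env ID and save path are assumptions (the plot title above
# suggests Pong); the comments mention training for 2e7 timesteps with seed = 7:
train_dqn_adv('PongNoFrameskip-v4', train_timesteps=int(2e7), seed=7,
              policy='cnn', save_params='dqn_pong_adv_params')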
def make_rosetta_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0,
                     allow_early_resets=True, start_method=None, use_subprocess=False):
    """
    Create a wrapped, monitored VecEnv for Rosetta.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param wrapper_kwargs: (dict) the parameters for the wrap_deepmind function
    :param start_index: (int) start rank index
    :param allow_early_resets: (bool) allows early reset of the environment
    :param start_method: (str) method used to start the subprocesses.
        See SubprocVecEnv doc for more information
    :param use_subprocess: (bool) Whether to use `SubprocVecEnv` or `DummyVecEnv` when
        `num_env` > 1; `DummyVecEnv` is usually faster. Default: False
    :return: (VecEnv) The Rosetta environment
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = Monitor(env,
                          logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets,
                          reset_keywords=())
            # return wrap_deepmind(env, **wrapper_kwargs)
            return env
        return _thunk

    set_global_seeds(seed)

    # When using one environment, no need to start subprocesses
    if num_env == 1 or not use_subprocess:
        return DummyVecEnv([make_env(i + start_index) for i in range(num_env)])
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)],
                         start_method=start_method)
def run_agent(envs, parameters, trial):
    '''Train an agent.'''
    path = Path(parameters['path'])
    dummy_env = OptVecEnv(envs)
    set_global_seeds(parameters.setdefault('seed'))
    if parameters['alg'] == 'PPO':
        model = PPO2(MlpPolicy, dummy_env, gamma=parameters['gamma'],
                     learning_rate=parameters['learning_rate'], verbose=0)
    elif parameters['alg'] == 'A2C':
        model = A2C(MlpPolicy, dummy_env, gamma=parameters['gamma'],
                    learning_rate=parameters['learning_rate'], verbose=0)
    try:
        timesteps = parameters['total_timesteps'] * dummy_env.agent_no_list[0]
        with tqdm(count(), leave=True) as progress:
            progress = iter(progress)

            def callback(local_vars, global_vars):
                if next(progress) % 100:
                    callback_env = local_vars['self'].env
                    get_total_reward(callback_env)
                    trial.report(get_total_reward(callback_env), local_vars['update'])
                    if trial.should_prune():
                        raise optuna.structs.TrialPruned()

            model.learn(total_timesteps=timesteps, callback=callback)
        return get_total_reward(dummy_env)
    finally:
        dummy_env.close()
        model.save(str(path / 'model.pkl'))
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    Run the training of DDPG.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'),
        can use multiple noise types by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env,
                        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()

    model = DDPG(policy=MlpPolicy, env=env, memory_policy=Memory, eval_env=eval_env,
                 param_noise=param_noise, action_noise=action_noise,
                 memory_limit=int(1e6), layer_norm=layer_norm, verbose=2, **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()

    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
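# Invocation sketch: the env ID and noise level are illustrative; the noise string
# follows the parser above, e.g. 'adaptive-param_0.2' or 'ou_0.3,normal_0.1':
run('HalfCheetah-v2', seed=0, noise_type='adaptive-param_0.2',
    layer_norm=True, evaluation=False)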
def run(env_id, seed, layer_norm, evaluation, agent, delay_step, gamma=0.99, **kwargs):
    # Create envs.
    env = create_env(env_id, delay_step, str(0))
    print(env.observation_space, env.action_space)
    if evaluation:
        eval_env = create_env(env_id, delay_step, "eval_env")
    else:
        eval_env = None

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    start_time = time.time()
    policy = 'MlpPolicy'
    td3_variants = {
        "TD3": TD3,
        "TD3SIL": TD3SIL,
        "TD3NSTEP": TD3NSTEP,
        "TD3REDQ": TD3REDQ,
        "TD3DoubleTwin": TD3DoubleTwin,
    }
    if td3_variants.get(agent, None):
        model_func = td3_variants[agent]
        model = model_func(policy=policy, env=env, eval_env=eval_env, gamma=gamma,
                           batch_size=128, tau=0.005, policy_delay=2,
                           learning_starts=25000,
                           action_noise=create_action_noise(env, "normal_0.1"),
                           buffer_size=100000, verbose=2, n_cpu_tf_sess=10,
                           policy_kwargs={"layers": [400, 300]})
    elif agent == "DDPG":
        model = DDPG(policy=policy, env=env, eval_env=eval_env, gamma=gamma,
                     nb_eval_steps=5, batch_size=100, nb_train_steps=100,
                     nb_rollout_steps=100, learning_starts=10000,
                     actor_lr=1e-3, critic_lr=1e-3, critic_l2_reg=0, tau=0.005,
                     normalize_observations=False,
                     action_noise=create_action_noise(env, "normal_0.1"),
                     buffer_size=int(1e6), verbose=2, n_cpu_tf_sess=10,
                     policy_kwargs={"layers": [400, 300]})
    elif agent == "SAC":
        model = SAC(policy=policy, env=env, eval_env=eval_env, gamma=gamma,
                    batch_size=256,
                    action_noise=create_action_noise(env, "normal_0.1"),
                    buffer_size=int(1e6), verbose=2, n_cpu_tf_sess=10,
                    learning_starts=10000, policy_kwargs={"layers": [256, 256]})
    elif agent == "GEM":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemGEM(policy=policy, env=env, eval_env=eval_env, gamma=gamma,
                          batch_size=128, tau=0.005, policy_delay=2,
                          learning_starts=25000,
                          action_noise=create_action_noise(env, "normal_0.1"),
                          buffer_size=100000, verbose=2, n_cpu_tf_sess=10,
                          alpha=0.5, beta=-1, iterative_q=-1, num_q=4,
                          gradient_steps=200, max_step=kwargs['max_steps'],
                          reward_scale=1., nb_eval_steps=10,
                          policy_kwargs={"layers": [400, 300]})
    elif agent == "BP":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemBackProp(policy=policy, env=env, eval_env=eval_env, gamma=gamma,
                               batch_size=128, tau=0.005, policy_delay=2,
                               learning_starts=25000,
                               action_noise=create_action_noise(env, "normal_0.1"),
                               buffer_size=100000, verbose=2, n_cpu_tf_sess=10,
                               alpha=0.5, beta=-1, gradient_steps=200,
                               max_step=kwargs['max_steps'], reward_scale=1.,
                               nb_eval_steps=10,
                               policy_kwargs={"layers": [400, 300]})
    else:
        raise NotImplementedError
    print("model building finished")

    model.learn(total_timesteps=kwargs['num_timesteps'])
    env.close()
    if eval_env is not None:
        eval_env.close()
    logger.info('total runtime: {}s'.format(time.time() - start_time))