def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True,
                            clip_obs=10., clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
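
The test above guards the pattern training scripts most often get wrong: a separate evaluation environment keeps its own, stale running statistics unless they are explicitly copied over. A minimal sketch of doing this by hand, assuming stable-baselines >= 2.10, where `sync_envs_normalization` and `unwrap_vec_normalize` are exposed from `stable_baselines.common.vec_env`:

import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import (DummyVecEnv, VecNormalize,
                                             sync_envs_normalization)

train_env = VecNormalize(DummyVecEnv([lambda: gym.make("Pendulum-v0")]))
# Freeze statistics and report raw rewards on the eval side
eval_env = VecNormalize(DummyVecEnv([lambda: gym.make("Pendulum-v0")]),
                        training=False, norm_reward=False)

model = PPO2("MlpPolicy", train_env, verbose=0)
model.learn(2048)
# Without this call, eval_env would normalize with its initial statistics
sync_envs_normalization(train_env, eval_env)
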
def optimize_agent(self, trial):
    env_id = "default"
    num_e = 1  # Number of processes to use
    self.train_env = DummyVecEnv([lambda: Template_Gym(eval=False)])
    self.train_env = VecNormalize(self.train_env, norm_obs=True, norm_reward=True)
    self.test_env = DummyVecEnv([lambda: Template_Gym(eval=True)])
    # Wrap the test env itself (the original wrapped self.train_env here by mistake)
    self.test_env = VecNormalize(self.test_env, norm_obs=True, norm_reward=True)

    self.model_params = self.optimize_ppo2(trial)
    self.model = PPO2(CustomPolicy_2, self.train_env, verbose=0, nminibatches=1,
                      tensorboard_log=Path("./tensorboard2").name, **self.model_params)

    last_reward = -np.finfo(np.float16).max
    evaluation_interval = 3000

    for eval_idx in range(self.n_evaluations):
        try:
            self.model.learn(evaluation_interval)
        except AssertionError:
            raise

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        obs = self.test_env.reset()
        while n_episodes < self.n_test_episodes:
            action, _ = self.model.predict(obs)
            obs, reward, done, _ = self.test_env.step(action)
            reward_sum += reward
            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = self.test_env.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)
        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
def _add_normalization_wrapper(env, n_envs, normalize):
    if isinstance(normalize, bool):
        env = VecNormalize(env)
    elif isinstance(normalize, dict):
        if 'trained_agent' in normalize:
            path = normalize.pop('trained_agent')
            env = VecNormalize.load(path, env)
            env.training = normalize.pop('training', True)
        elif normalize.pop('precompute', False):
            samples = normalize.pop('samples', 10000)
            env = _precompute_normalization(env, n_envs, samples, normalize)
        else:
            env = VecNormalize(env, **normalize)
    return env
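
Hypothetical calls into the helper above, one per config shape it accepts. The dict keys ('trained_agent', 'training', 'precompute', 'samples') come from the helper itself; the stats path and base env are made up for illustration, and `_precompute_normalization` is project-local:

import gym
from stable_baselines.common.vec_env import DummyVecEnv

base_env = DummyVecEnv([lambda: gym.make("Pendulum-v0")])

env = _add_normalization_wrapper(base_env, n_envs=1, normalize=True)  # plain defaults
env = _add_normalization_wrapper(base_env, 1, {'norm_reward': False, 'clip_obs': 5.0})
env = _add_normalization_wrapper(base_env, 1, {'trained_agent': 'stats/vecnormalize.pkl',  # hypothetical path
                                               'training': False})
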
def make_alrs_env(args, test=False, baseline=False):
    """
    Make a new ALRS environment with parameters specified as command line arguments.
    """
    from environment import AdaptiveLearningRateOptimizer

    env = make_vec_env(
        env_id=AdaptiveLearningRateOptimizer,
        n_envs=1 if test else args.num_envs,
        env_kwargs={
            'dataset': args.dataset,
            'architecture': args.architecture,
            'batch_size': args.batch_size,
            'update_freq': args.update_freq,
            'num_train_steps': args.num_train_steps,
            'initial_lr': args.initial_lr,
            'discrete': args.discrete,
            'action_range': np.inf if baseline else args.action_range,
            'lr_noise': not (test or baseline)
        }
    )
    env = VecNormalize(
        venv=env,
        norm_obs=args.ppo2_norm_obs,
        norm_reward=args.ppo2_norm_reward,
        clip_obs=args.ppo2_cliprange if args.ppo2_cliprange > 0 else 10,
        clip_reward=args.ppo2_cliprange if args.ppo2_cliprange > 0 else 10,
        gamma=args.ppo2_gamma
    )
    env.alrs = env.venv.envs[0].env
    return env
def train(self, num_e=1, n_timesteps=100000000, save_fraction=0.1, save='saves/min1'):
    env_id = "default"
    num_e = 1  # Number of processes to use
    # Create the vectorized environment
    self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-4,
                      nminibatches=1, tensorboard_log="./min1")

    n_timesteps = int(n_timesteps * save_fraction)
    training_loop = int(1 / save_fraction)
    log_dir = "saves"
    for i in range(training_loop):
        self.model.learn(n_timesteps)
        self.model.save(save + str(i))
        # Keep the normalization statistics in sync with each checkpoint
        self.env.save_running_average(log_dir)
def __init__(self, **params):
    super().__init__(**params)
    self.Model = PPO2
    self.solver_signature = "gym_" + ParameterManager.get_param_footprint(self.get_footprint_params())

    # parameters from our config, not the original one
    self.days = self.params['dataset']["days"]
    env_id = "TaxiEnv-v01"
    self.env_params = self.load_env_params()
    seed = np.random.randint(1, 10000)
    self.log['seed'] = seed

    if self.params.get("lstm", 0) == 1:
        Policy = MlpLstmPolicy
        nminibatches = 1
        # One current limitation of recurrent policies is that you must test them
        # with the same number of environments they have been trained on.
        num_cpu = 1
    else:
        Policy = MlpPolicy
        nminibatches = 4
        num_cpu = self.params['num_cpu']

    # Create the vectorized environment
    self.train_env = SubprocVecEnv([self.make_env(env_id, i, seed + i, self.env_params)
                                    for i in range(num_cpu)])
    self.train_env = VecNormalize(self.train_env, norm_obs=False, norm_reward=False)

    # minibatches are important, and no parallelism
    # (use the policy-dependent value computed above; the original hardcoded
    # nminibatches=4, which breaks the LSTM branch)
    self.model = self.Model(Policy, self.train_env, verbose=0, nminibatches=nminibatches,
                            tensorboard_log=os.path.join(self.dpath, self.solver_signature),
                            n_steps=self.params['dataset']['time_periods'] + 1)
def create_env(n_envs, eval_env=False, no_log=False):
    global hyperparams, env_kwargs
    log_dir = None if eval_env or no_log else save_path

    if n_envs == 1:
        env = DummyVecEnv([make_env(env_id, 0, seed,
                                    wrapper_class=env_wrapper, log_dir=log_dir,
                                    env_kwargs=env_kwargs)])
    else:
        # One env factory per rank (the original dropped the rank/seed
        # arguments and the loop over n_envs here)
        env = DummyVecEnv([make_env(env_id, i, seed,
                                    wrapper_class=env_wrapper, log_dir=log_dir,
                                    env_kwargs=env_kwargs)
                           for i in range(n_envs)])

    if normalize:
        local_normalize_kwargs = {'norm_reward': False}
        env = VecNormalize(env, **local_normalize_kwargs)
    return env
def createEnvs(args, allow_early_resets=False, env_kwargs=None, load_path_normalise=None):
    """
    :param args: (argparse.Namespace Object)
    :param allow_early_resets: (bool) Allow reset before the environment is done,
        usually used in ES to halt the envs
    :param env_kwargs: (dict) The extra arguments for the environment
    :param load_path_normalise: (str) the path for loading the rolling average,
        None if not available or wanted.
    :return: (Gym VecEnv)
    """
    # imported here to prevent cyclic imports
    envs = [makeEnv(args.env, args.seed, i, args.log_dir,
                    allow_early_resets=allow_early_resets, env_kwargs=env_kwargs)
            for i in range(args.num_cpu)]

    if len(envs) == 1:
        # No need for subprocesses when having only one env
        envs = DummyVecEnv(envs)
    else:
        envs = SubprocVecEnv(envs)

    envs = VecFrameStack(envs, args.num_stack)
    envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
    # envs = loadRunningAverage(envs, load_path_normalise=load_path_normalise)
    return envs
def main(log_dir, easy, n_steps=450):
    exp_dir, seed_offset = get_exp_dir(
        os.path.join(log_dir, "reacher-obstacle-default" + ("-easy" if easy else "") + "-ppo"))
    print("Seed offset: " + str(seed_offset))
    log_path = os.path.join(exp_dir, "ppo-reach-avoid.log")
    avg_log_path = exp_dir

    if not os.path.exists(log_path):
        n_envs = 8
        env = VecNormalize(
            SubprocVecEnv([create_env_fn(seed_offset * n_envs + i, easy=easy)
                           for i in range(0, n_envs)]),
            gamma=0.999)
        model = PPO2(policy='MlpPolicy', env=env, n_steps=n_steps, nminibatches=5,
                     verbose=1, gamma=0.999, noptepochs=15, ent_coef=1e-3, lam=1,
                     policy_kwargs=dict(layers=[164, 164]))

        average_rewards = []

        def log_callback(local_vars, global_vars):
            avg_r = np.mean([ep_info['r'] for ep_info in local_vars["ep_info_buf"]])
            average_rewards.append(avg_r)
            return True

        # 3067500 = 409 iterations (400 + 9 for buffer initialization) * 50 trajectories * 150 timesteps
        model.learn(3067500, seed=seed_offset, callback=log_callback)
        model.save(log_path)
        env.save_running_average(avg_log_path)
        np.save(os.path.join(exp_dir, "rewards.npy"), np.array(average_rewards))
def create_env(n_envs, env_name=None, log_dir=None):
    return VecNormalize(make_vec_env(ENVS[env_name][env_id],
                                     n_envs=n_envs,
                                     env_kwargs=ENVS[env_name][env_kwargs],
                                     monitor_dir=log_dir),
                        norm_obs=False, norm_reward=True)
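
For context, a hypothetical shape of the `ENVS` registry the one-liner above indexes into; the bare `env_id` and `env_kwargs` used as dict keys suggest module-level string constants, which is an assumption here:

# Assumed module-level key constants (not shown in the snippet)
env_id, env_kwargs = 'env_id', 'env_kwargs'
ENVS = {
    'pendulum': {'env_id': 'Pendulum-v0', 'env_kwargs': {}},
}
env = create_env(n_envs=4, env_name='pendulum', log_dir='logs/')
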
def train(self, num_e=1, n_timesteps=1000000, save_fraction=0.0125,
          save='saves/audbuyh4120', config=config):
    env_id = "default"
    num_e = 1  # Number of processes to use
    # Create the vectorized environment
    self.config = config
    self.env = SubprocVecEnv([self.make_env(env_id, i, eval=False, config=self.config)
                              for i in range(num_e)])
    self.env = VecNormalize(self.env, norm_obs=False, norm_reward=True)
    self.model = PPO2(CnnPolicy, self.env, verbose=0)

    n_timesteps = int(n_timesteps * save_fraction)
    training_loop = int(1 / save_fraction)
    log_dir = "saves"
    for i in range(training_loop):
        self.model.learn(n_timesteps)
        self.model.save(self.config.save + str(i))
def train(self, num_e=1, n_timesteps=1000000, save='saves/agent4'):
    env_id = "default"
    num_e = 1  # Number of processes to use
    # Create the vectorized environment
    self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    self.model = PPO2(CustomPolicy, env=self.env, verbose=0,
                      learning_rate=1e-5, tensorboard_log=save)
    for i in range(10):
        self.model.learn(n_timesteps)
        self.model.save(save)
def evaluate(self, num_env=1, num_steps=1461, load='saves/audbuyh1', runs=80,
             config=pc.configgbpchf4h):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :return: (float) Mean reward
    """
    env_id = config.year + config.pair
    num_e = 1
    self.config = config
    log_dir = self.config.log
    self.env = SubprocVecEnv([self.make_env(env_id, i, eval=True, config=self.config)
                              for i in range(num_env)])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    try:
        self.env.load_running_average(log_dir)
    except Exception:
        print("Could not load the running average")

    for run in range(runs):
        self.model = PPO2.load(self.config.path + '8' + str(run) + '.pkl', self.env,
                               policy=CustomPolicy_5, tensorboard_log="./default/")
        episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
        obs = self.env.reset()
        state = None
        # When using VecEnv, done is a vector
        done = [False for _ in range(self.env.num_envs)]
        for _ in range(num_steps):
            # _states are only useful when using LSTM policies
            action, state = self.model.predict(obs, state=state, mask=done, deterministic=True)
            # action, rewards and dones are arrays because we use a vectorized env
            obs, rewards, dones, _ = self.env.step(action)
            # Stats
            for i in range(self.env.num_envs):
                episode_rewards[i][-1] += rewards[i]
                if dones[i]:
                    episode_rewards[i].append(0.0)

    mean_rewards = [0.0 for _ in range(self.env.num_envs)]
    n_episodes = 0
    for i in range(self.env.num_envs):
        mean_rewards[i] = np.mean(episode_rewards[i])
        n_episodes += len(episode_rewards[i])
    # Compute mean reward
    mean_reward = np.mean(mean_rewards)
    print("Mean reward:", mean_reward, "Num episodes:", n_episodes)
    return mean_reward
def create_env(n_envs, eval_env=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether is it an environment used for evaluation or not
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env else save_path

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif algo_ in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed,
                                        wrapper_class=env_wrapper, log_dir=log_dir)])
        else:
            # On most envs, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed,
                                        log_dir=log_dir, wrapper_class=env_wrapper)
                               for i in range(n_envs)])
        if normalize:
            if args.verbose > 0:
                if len(normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
        del hyperparams['frame_stack']
    return env
def main(cfg, run_dir):
    run_name = make_run_name(cfg)
    output_dir = run_dir / run_name
    output_dir.mkdir(parents=True)
    with (output_dir / 'config.json').open('w') as fp:
        json.dump(cfg, fp, indent=2)

    # Setting log levels to cut out minor errors
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)
    log_dir = output_dir / cfg['log_dir']
    tensorboard_dir = output_dir / cfg['tb_dir']
    configure(log_dir=str(log_dir),
              format_strs=['log', 'csv', 'tensorboard'],
              tensorboard_dir=str(tensorboard_dir))

    # Create and wrap the environment
    logging.info('Starting {env_name}'.format(**cfg))
    env = make_atari_env(env_id=cfg['env_name'], num_env=8, seed=cfg['train_seed'])
    env = VecFrameStack(env, n_stack=4)
    if cfg['normalize']:
        env = VecNormalize(env)

    # Setting all known random seeds (Python, Numpy, TF, Gym if available)
    set_global_seeds(cfg['train_seed'])

    logging.info('Running {algo}'.format(**cfg))
    algo = get_algo(cfg['algo'])
    policy = cfg['policy_type']
    feature_extractor = get_network_builder(cfg['network'])
    attn_loss = get_loss(cfg['attn_loss'])()
    model = algo(
        policy=policy,
        env=env,
        verbose=1,
        learning_rate=lambda frac: 0.00025 * frac,
        attn_loss=attn_loss,
        attn_coef=cfg['attn_coef'],
        policy_kwargs={
            'cnn_extractor': feature_extractor,
        },
        tensorboard_log=str(tensorboard_dir),
    )

    logging.info('Training for {time_steps} steps'.format(**cfg))
    # Training
    model.learn(
        total_timesteps=cfg['time_steps'],
        log_interval=cfg['log_interval'],
        tb_log_name=None,
        callback=Callback(output_dir),
    )
def test_vec_normalize():
    env = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
    normalized_vec_env = VecNormalize(env)
    obs = normalized_vec_env.reset()
    for _ in range(10):
        action = [normalized_vec_env.action_space.sample()]
        obs, reward, _, _ = normalized_vec_env.step(action)
        print(obs, reward)
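
Most of the snippets in this collection revolve around persisting the statistics a wrapper like the one above accumulates. A sketch of the round trip, assuming stable-baselines >= 2.10 (older versions expose `save_running_average` / `load_running_average` with a directory argument instead; see the sketch at the end of this section):

import gym
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

venv = VecNormalize(DummyVecEnv([lambda: gym.make("Pendulum-v0")]))
venv.reset()
venv.save("vecnormalize.pkl")

restored = VecNormalize.load("vecnormalize.pkl",
                             DummyVecEnv([lambda: gym.make("Pendulum-v0")]))
restored.training = False     # freeze statistics at evaluation time
restored.norm_reward = False  # report unnormalized rewards
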
def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy', env, model_class, n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy, verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))

    model = HER.load('./test_her')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
def create_test_env(env_id, n_envs=1, is_atari=False, stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None, env_kwargs=None):
    if hyperparams is None:
        hyperparams = {}
    if env_kwargs is None:
        env_kwargs = {}

    # Create the environment and wrap it if necessary
    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    else:
        # start_method = 'spawn' for thread safe
        env = DummyVecEnv([make_env(env_id, i, seed, log_dir,
                                    wrapper_class=None, env_kwargs=env_kwargs)
                           for i in range(n_envs)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
def vec_env(env_name, num_envs=4, seed=33, norm_rew=True, load_path=None):
    '''Creates environments, vectorizes them and sets different seeds.
    :param norm_rew: reward should only be normalized during training
    :param load_path: if set, the VecNormalize environment will
                      load the running means from this path.
    :returns: VecNormalize (wrapped Subproc- or Dummy-VecEnv)
    '''
    from gym_mimic_envs.mimic_env import MimicEnv
    from gym_mimic_envs.monitor import Monitor as EnvMonitor

    def make_env_func(env_name, seed, rank):
        def make_env():
            env = gym.make(env_name)
            env.seed(seed + rank * 100)
            if isinstance(env, MimicEnv):
                # wrap a MimicEnv in the EnvMonitor
                # has to be done before converting into a VecEnv!
                env = EnvMonitor(env)
            return env
        return make_env

    if num_envs == 1:
        vec_env = DummyVecEnv([make_env_func(env_name, seed, 0)])
    else:
        env_fncts = [make_env_func(env_name, seed, rank) for rank in range(num_envs)]
        vec_env = SubprocVecEnv(env_fncts)

    # Normalize the environments.
    # If a load_path was specified, load the running mean and std
    # of obs and returns from this path.
    if load_path is not None:
        vec_normed = VecNormalize.load(load_path, vec_env)
    # TODO: this whole else branch can probably be deleted.
    # If we want to load obs_rms from an earlier run, we should be able to do it
    # by just specifying a load_path, the same way as when we load a complete
    # trained model.
    else:
        try:
            from scripts.common.config import is_mod, MOD_LOAD_OBS_RMS
            if not is_mod(MOD_LOAD_OBS_RMS):
                raise Exception
            # load the obs_rms from a previously trained model
            init_obs_rms_path = abs_project_path + 'models/behav_clone/models/rms/env_999'
            vec_normed = VecNormalize.load(init_obs_rms_path, vec_env)
            log('Successfully loaded OBS_RMS from a previous model:',
                [f'file:\t {init_obs_rms_path}',
                 f'mean:\t {vec_normed.obs_rms.mean}',
                 f'var:\t {vec_normed.obs_rms.var}'])
        except Exception:
            log('NOT loading obs_rms from a previous run.')
            vec_normed = VecNormalize(vec_env, norm_obs=True, norm_reward=norm_rew)

    return vec_normed
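
A sketch of calling the factory above; the MimicEnv ids are project-specific, so a standard gym id stands in here, and the stats path is hypothetical:

train_venv = vec_env("Pendulum-v0", num_envs=4, seed=33, norm_rew=True)
eval_venv = vec_env("Pendulum-v0", num_envs=1, norm_rew=False,
                    load_path="models/vecnormalize.pkl")  # hypothetical path
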
def evaluate(self, num_env=1, num_steps=175200, load="saves/min", runs=2):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :return: (float) Mean reward
    """
    env_id = 'default'
    num_e = 1
    log_dir = "saves"
    self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_env)])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    self.env.load_running_average(log_dir)

    for run in range(runs):
        self.model = PPO2.load(load + str(run), self.env,
                               policy=CustomPolicy_2, tensorboard_log="./default/")
        self.env.load_running_average(log_dir)
        episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
        obs = self.env.reset()
        state = None
        # When using VecEnv, done is a vector
        # (the original referenced an undefined `env` here)
        done = [False for _ in range(self.env.num_envs)]
        for _ in range(num_steps):
            # _states are only useful when using LSTM policies
            action, state = self.model.predict(obs, state=state, mask=done, deterministic=False)
            # action, rewards and dones are arrays because we use a vectorized env
            obs, rewards, dones, _ = self.env.step(action)
            # Stats
            for i in range(self.env.num_envs):
                episode_rewards[i][-1] += rewards[i]
                if dones[i]:
                    episode_rewards[i].append(0.0)

    mean_rewards = [0.0 for _ in range(self.env.num_envs)]
    n_episodes = 0
    for i in range(self.env.num_envs):
        mean_rewards[i] = np.mean(episode_rewards[i])
        n_episodes += len(episode_rewards[i])
    # Compute mean reward
    mean_reward = np.mean(mean_rewards)
    print("Mean reward:", mean_reward, "Num episodes:", n_episodes)
    return mean_reward
def run():
    """
    The main function of the agent
    Parses argv and executes accordingly
    """
    visualize = sys.argv[1] == "v" if len(sys.argv) > 1 else False
    resume = sys.argv[1] == "r" if len(sys.argv) > 1 else False
    evaluate = visualize or (sys.argv[1] == "e" if len(sys.argv) > 1 else False)
    loadpath = sys.argv[2] if resume or evaluate else ""

    print("Setting up env")
    env = SubprocVecEnv([make_env(ENV, i) for i in range(N_PROCS)], start_method='spawn')
    eval_env = DummyVecEnv([make_env(ENV, i) for i in range(N_PROCS)])
    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=False, clip_obs=10.)

    print("Setting up model")
    if not (resume or evaluate):
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
        model = Model(env=env, eval_env=eval_env, env_name=ENV_NAME,
                      seed=SEED, n_procs=N_PROCS, num_steps=NUM_STEPS)
    else:
        model = Model.load(loadpath, env, eval_env=eval_env, env_name=ENV_NAME,
                           seed=SEED, n_procs=N_PROCS, num_steps=NUM_STEPS)

    if not evaluate:
        model.trainAndSave()
    else:
        model.evaluate(visualize)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("algorithm", help="Which algorithm are you using?", type=str)
    parser.add_argument("training_timesteps", help="How many training steps are there?", type=int)
    parser.add_argument("testing_timesteps", help="How many testing steps are there?", type=int)
    parser.add_argument("training_iterations", help="How many training iterations are there?", type=int)
    parser.add_argument("testing_iterations", help="How many testing iterations are there?", type=int)
    parser.add_argument("learning_rate", help="What is the learning rate?", type=float)
    parser.add_argument("batch_size", help="What is the batch size?", type=int)
    parser.add_argument("building_port", help="What is the building_port?", type=int)
    parser.add_argument("reward_port", help="What is the reward_port?", type=int)
    parser.add_argument("agent_port", help="What is the agent_port?", type=int)
    args = parser.parse_args()

    all_ports = [args.building_port, args.reward_port, args.agent_port]
    df11 = pd.DataFrame(all_ports)
    df11.to_csv('allports.csv', index=False)

    hostname = socket.gethostname()
    path = os.path.join(sys.path[0], hostname)
    path_for_kill_file = os.path.join(sys.path[0], "kill.sh")

    env = gym.make('RCRS-v2')
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    # Automatically normalize the input features
    env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
    run_model(args.algorithm, args.training_timesteps, args.testing_timesteps,
              args.training_iterations, args.testing_iterations, args.learning_rate,
              args.batch_size, env=env, hostname=hostname,
              path_for_kill_file=path_for_kill_file)
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
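
A sketch of how such a warm-start helper is typically used in a test: after 100 random steps the running statistics should have moved away from their initial values, and observations should come back normalized. The attribute names (`obs_rms`, `clip_obs`) are part of the stable-baselines 2.x `VecNormalize` wrapper:

import numpy as np

venv = _make_warmstart_cartpole()
# The running mean is no longer at its zero initialization
assert not np.allclose(venv.obs_rms.mean, 0.0)
# Observations are normalized and clipped to [-clip_obs, clip_obs]
obs = venv.reset()
assert np.all(np.abs(obs) <= venv.clip_obs)
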
def load_train_env(num_envs, robot_radius, rew_fnc, num_stacks, stack_offset,
                   debug, task_mode, policy, disc_action_space, normalize):
    # Choose the environment wrapper according to the policy
    if policy in ("CnnPolicy", "CnnLnLstmPolicy", "CnnLstmPolicy"):
        env_temp = RosEnvDiscImg if disc_action_space else RosEnvContImg
    elif policy == "CNN1DPolicy":
        env_temp = RosEnvDiscRawScanPrepWp if disc_action_space else RosEnvContRawScanPrepWp
    elif policy == "CNN1DPolicy_multi_input":
        env_temp = RosEnvDiscRaw if disc_action_space else RosEnvContRaw
    elif policy in ("CnnPolicy_multi_input_vel", "CnnPolicy_multi_input_vel2"):
        env_temp = RosEnvDiscImgVel if disc_action_space else RosEnvContImgVel

    env = SubprocVecEnv([
        lambda k=k: Monitor(
            env_temp("sim%d" % (k + 1),
                     StateCollector("sim%s" % (k + 1), "train"),
                     stack_offset, num_stacks, robot_radius, rew_fnc,
                     debug, "train", task_mode),
            '%s/%s/sim_%d' % (path_to_models, agent_name, k + 1),
            allow_early_resets=True)
        for k in range(num_envs)
    ])

    # Normalize observations?
    if normalize:
        env = VecNormalize(env, training=True, norm_obs=True, norm_reward=False,
                           clip_obs=100.0, clip_reward=10.0, gamma=0.99, epsilon=1e-08)

    # Stack frames?
    if num_stacks > 1:
        env = VecFrameStack(env, n_stack=num_stacks, n_offset=stack_offset)

    return env
def run_ppo_policies(easy, main_dir, n_exps):
    env = VecNormalize(DummyVecEnv([create_env_fn(0, monitored=False, easy=easy)]),
                       gamma=0.999, training=False)
    states = []
    for i in range(1, n_exps + 1):
        states.append(np.array(run_ppo_policy(env, os.path.join(main_dir, "exp-" + str(i)))))
    return states
def make_env(env_id, env_args, seed, is_train, with_vecnorm):
    monitor_dir = os.path.join(env_args['log_file'], 'log')

    if is_train:
        # env for training
        env = make_vec_env(env_id=lambda: gym.make(env_id, **env_args),
                           seed=seed, monitor_dir=monitor_dir, n_envs=1)
        if with_vecnorm:
            env = VecNormalize(env, norm_obs=True, norm_reward=True,
                               clip_obs=10., clip_reward=10.)

        # env for evaluation during training
        env_args['renders'] = False
        if 'dset' in env_args:
            env_args['dset'] = 'eval'

        eval_env = make_vec_env(env_id=lambda: gym.make(env_id, **env_args),
                                seed=seed + 1, monitor_dir=monitor_dir + '/eval', n_envs=1)
        if with_vecnorm:
            eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True,
                                    clip_obs=10., clip_reward=10.)
    else:
        env = gym.make(env_id, **env_args)
        eval_env = None

    return env, eval_env
def create_env(n_envs):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :return: (gym.Env)
    """
    global hyperparams

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif args.algo in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        # No env_wrapper applied for now as not using make_env()
        env = gym.make(env_id)
        env.seed(args.seed)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed, wrapper_class=env_wrapper)])
        else:
            # On most envs, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed, wrapper_class=env_wrapper)
                               for i in range(n_envs)])
        if normalize:
            if args.verbose > 0:
                print("Normalizing input and return")
            env = VecNormalize(env, **normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
        del hyperparams['frame_stack']
    return env
def vecEnv(env_kwargs_local, env_class):
    """
    Local Env Wrapper
    :param env_kwargs_local: arguments related to the environment wrapper
    :param env_class: class of the env
    :return: env for the pretrained algo
    """
    train_env = env_class(**{**env_kwargs_local, "record_data": False, "renders": False})
    train_env = DummyVecEnv([lambda: train_env])
    train_env = VecNormalize(train_env, norm_obs=True, norm_reward=False)
    return train_env
def create_env(env_name, config=None, n_workers=8, image_based=True, **kwargs):
    """
    Parses the environment to correctly return the attributes based on the spec and type.
    Creates a corresponding vectorized environment.
    """
    def make_rl(**kwargs):
        """Decorator for custom RL environments"""
        def _init():
            env_obj = getattr(rl.environments, env_name)
            env = env_obj(config)
            return env
        return _init

    def make_gym(rank, seed=0, **kwargs):
        """Decorator for gym environments"""
        def _init():
            env = gym.make(env_name)
            env.seed(seed + rank)
            return env
        return _init

    if config is not None:
        n_workers = config['main']['n_workers']

    mapping = {'gym': make_gym, 'rl': make_rl}
    env_type = get_env_type(env_name)
    env_decorator = mapping[env_type]
    vectorized_decorator = [env_decorator(rank=x) for x in range(n_workers)]

    # Parallelize
    if n_workers > 1:
        method = 'spawn' if sys.platform == 'win32' else 'forkserver'
        vectorized = SubprocVecEnv(vectorized_decorator, start_method=method)
    else:
        # Non multi-processing env
        vectorized = DummyVecEnv(vectorized_decorator)

    # Frame-stacking for CNN based environments
    if 'frame_stack' in config['main'].keys():
        if config['main']['frame_stack'] != 0:
            vectorized = VecFrameStack(vectorized, n_stack=config['main']['frame_stack'])
    if 'normalize' in config['main'].keys():
        vectorized = VecNormalize(vectorized, clip_obs=1, clip_reward=1)
    return vectorized
def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
    # Even though DeepQ is single core only, we need to use the pipe system to work
    if env_kwargs is not None and env_kwargs.get("use_srl", False):
        srl_model = MultiprocessSRLModel(1, args.env, env_kwargs)
        env_kwargs["state_dim"] = srl_model.state_dim
        env_kwargs["srl_pipe"] = srl_model.pipe

    env = DummyVecEnv([makeEnv(args.env, args.seed, 0, args.log_dir, env_kwargs=env_kwargs)])
    if args.srl_model != "raw_pixels":
        env = VecNormalize(env, norm_reward=False)
        env = loadRunningAverage(env, load_path_normalise=load_path_normalise)
    return env
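
Finally, several snippets above (the `train`/`evaluate` pairs and the loading helpers) rely on the legacy, directory-based persistence API rather than the single-file `save`/`load` pair. A consolidated sketch of that round trip, assuming stable-baselines < 2.10 where `save_running_average` / `load_running_average` pickle one file per running-statistics object into a directory:

import gym
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

log_dir = "saves"
env = VecNormalize(DummyVecEnv([lambda: gym.make("Pendulum-v0")]))
env.reset()
for _ in range(100):
    env.step([env.action_space.sample()])
env.save_running_average(log_dir)  # writes the running statistics into log_dir

# later, at evaluation time: recreate the wrapper, then restore the statistics
eval_env = VecNormalize(DummyVecEnv([lambda: gym.make("Pendulum-v0")]),
                        training=False, norm_reward=False)
eval_env.load_running_average(log_dir)
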