def evaluate(self, num_env=1, num_steps=21900, load="saves/aud5", runs=10):
    """
    Evaluate a sequence of saved RL agents on a normalized vectorized env.

    :param num_env: (int) number of parallel environments to create
    :param num_steps: (int) number of timesteps to run each loaded model for
    :param load: (str) path prefix of the saved models; the run index is appended
    :param runs: (int) number of saved models (load + str(i)) to evaluate
    :return: (float) mean reward of the last evaluated model
    """
    env_id = 'default'
    log_dir = "saves"
    self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_env)])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    self.env.load_running_average(log_dir)
    mean_reward = 0.0
    for run_idx in range(runs):
        self.model = PPO2.load(load + str(run_idx), self.env,
                               policy=CustomPolicy_4, tensorboard_log="./default/")
        # Reload normalization statistics for each model evaluation.
        self.env.load_running_average(log_dir)
        episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
        obs = self.env.reset()
        state = None  # _states are only useful when using LSTM policies
        # BUG FIX: the original referenced an undefined name `env` here.
        done = [False for _ in range(self.env.num_envs)]
        # BUG FIX: the step loop no longer shadows the outer run index.
        for _ in range(num_steps):
            action, state = self.model.predict(obs, state=state, mask=done,
                                               deterministic=True)
            # action, rewards and dones are arrays because the env is vectorized.
            obs, rewards, dones, _ = self.env.step(action)
            # Accumulate per-env episode rewards; start a new episode on done.
            for env_idx in range(self.env.num_envs):
                episode_rewards[env_idx][-1] += rewards[env_idx]
                if dones[env_idx]:
                    episode_rewards[env_idx].append(0.0)
        mean_rewards = [0.0 for _ in range(self.env.num_envs)]
        n_episodes = 0
        for env_idx in range(self.env.num_envs):
            mean_rewards[env_idx] = np.mean(episode_rewards[env_idx])
            n_episodes += len(episode_rewards[env_idx])
        # Compute mean reward across all parallel environments.
        mean_reward = np.mean(mean_rewards)
        print("Mean reward:", mean_reward, "Num episodes:", n_episodes)
    return mean_reward
def pre_train(self, num_e=1, load="saves/m19"):
    """Behaviour-clone a PPO2 model from an expert dataset, then roll it out."""
    env_id = 'default'
    num_e = 1
    log_dir = "saves"
    # Only one expert trajectory; traj_limitation=-1 would use the whole dataset.
    dataset = ExpertDataset(expert_path='default2.npz', traj_limitation=1,
                            batch_size=128)
    self.env = SubprocVecEnv([self.make_env(env_id, rank) for rank in range(num_e)])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    self.env.load_running_average(log_dir)
    self.model = PPO2(CustomPolicy, self.env, verbose=1, nminibatches=1,
                      learning_rate=1e-5, tensorboard_log="./m1ln4")
    self.env.load_running_average(log_dir)
    # Pretrain the PPO2 model on the expert data.
    self.model.pretrain(dataset, n_epochs=10000)
    # Test the pre-trained model on a fresh rollout.
    self.env = self.model.get_env()
    self.env.load_running_average(log_dir)
    obs = self.env.reset()
    reward_sum = 0.0
    for _ in range(1000000):
        action, _ = self.model.predict(obs)
        obs, reward, done, _ = self.env.step(action)
        reward_sum += reward
        if done:
            print(reward_sum)
            reward_sum = 0.0
            obs = self.env.reset()
    self.env.close()
def createEnvs(args, allow_early_resets=False, env_kwargs=None, load_path_normalise=None):
    """
    :param args: (argparse.Namespace Object)
    :param allow_early_resets: (bool) Allow reset before the enviroment is done,
        usually used in ES to halt the envs
    :param env_kwargs: (dict) The extra arguments for the environment
    :param load_path_normalise: (str) the path to loading the rolling average,
        None if not available or wanted.
    :return: (Gym VecEnv)
    """
    env_fns = [
        makeEnv(args.env, args.seed, rank, args.log_dir,
                allow_early_resets=allow_early_resets, env_kwargs=env_kwargs)
        for rank in range(args.num_cpu)
    ]
    # A single env does not need subprocesses.
    vec_cls = DummyVecEnv if len(env_fns) == 1 else SubprocVecEnv
    envs = vec_cls(env_fns)
    envs = VecFrameStack(envs, args.num_stack)
    envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
    # envs = loadRunningAverage(envs, load_path_normalise=load_path_normalise)
    return envs
def __init__(self, **params):
    """Set up the PPO2 solver: signature, seed, vectorized env and model."""
    super().__init__(**params)
    self.Model = PPO2
    self.solver_signature = "gym_" + ParameterManager.get_param_footprint(self.get_footprint_params())
    # parameters from our config, not the original one
    self.days = self.params['dataset']["days"]
    env_id = "TaxiEnv-v01"
    self.env_params = self.load_env_params()
    seed = np.random.randint(1, 10000)
    self.log['seed'] = seed
    if self.params.get("lstm", 0) == 1:
        Policy = MlpLstmPolicy
        # One current limitation of recurrent policies is that you must test
        # them with the same number of environments they have been trained on,
        # and nminibatches must match that single environment.
        nminibatches = 1
        num_cpu = 1
    else:
        Policy = MlpPolicy
        nminibatches = 4
        num_cpu = self.params['num_cpu']
    # Create the vectorized environment
    self.train_env = SubprocVecEnv(
        [self.make_env(env_id, i, seed + i, self.env_params) for i in range(num_cpu)])
    self.train_env = VecNormalize(self.train_env, norm_obs=False, norm_reward=False)
    # BUG FIX: use the nminibatches selected above instead of a hard-coded 4;
    # the LSTM branch requires nminibatches=1 to match its single environment.
    self.model = self.Model(Policy, self.train_env, verbose=0,
                            nminibatches=nminibatches,
                            tensorboard_log=os.path.join(self.dpath, self.solver_signature),
                            n_steps=self.params['dataset']['time_periods'] + 1)
def train(self, num_e=1, n_timesteps=1000000, save_fraction=0.0125, save='saves/audbuyh4120', config=config):
    """Train a PPO2 CNN agent, saving a checkpoint every save_fraction of the run."""
    env_id = "default"
    num_e = 1  # Number of processes to use
    self.config = config
    # Create the vectorized environment.
    env_fns = [self.make_env(env_id, rank, eval=False, config=self.config)
               for rank in range(num_e)]
    self.env = SubprocVecEnv(env_fns)
    self.env = VecNormalize(self.env, norm_obs=False, norm_reward=True)
    self.model = PPO2(CnnPolicy, self.env, verbose=0)
    steps_per_chunk = int(n_timesteps * save_fraction)
    num_chunks = int(1 / save_fraction)
    log_dir = "saves"
    # Learn in chunks so intermediate checkpoints are written to config.save.
    for chunk in range(num_chunks):
        self.model.learn(steps_per_chunk)
        self.model.save(self.config.save + str(chunk))
def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create a (possibly normalized) vectorized environment.

    :param n_envs: (int) number of environments to create
    :param eval_env: (bool) whether this is an evaluation env (disables logging)
    :param no_log: (bool) disable monitor logging
    :return: (VecEnv)
    """
    global hyperparams, env_kwargs
    # Do not log the evaluation environment.
    log_dir = None if eval_env or no_log else save_path
    if n_envs == 1:
        env = DummyVecEnv([
            make_env(env_id, 0, seed, wrapper_class=env_wrapper,
                     log_dir=log_dir, env_kwargs=env_kwargs)
        ])
    else:
        # BUG FIX: the original branch ignored n_envs entirely and dropped the
        # rank/seed arguments to make_env; build one env per rank instead
        # (call pattern matches the single-env branch above).
        env = DummyVecEnv([
            make_env(env_id, i, seed, wrapper_class=env_wrapper,
                     log_dir=log_dir, env_kwargs=env_kwargs)
            for i in range(n_envs)
        ])
    if normalize:
        # Rewards are deliberately left unnormalized here.
        local_normalize_kwargs = {'norm_reward': False}
        env = VecNormalize(env, **local_normalize_kwargs)
    return env
def train(self, num_e=1, n_timesteps=1000000, save='saves/agent4'):
    """Train a PPO2 agent on the normalized vectorized env, saving after each round."""
    env_id = "default"
    num_e = 1  # Number of processes to use
    # Create the vectorized environment.
    self.env = SubprocVecEnv([self.make_env(env_id, rank) for rank in range(num_e)])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    self.model = PPO2(CustomPolicy, env=self.env, verbose=0,
                      learning_rate=1e-5, tensorboard_log=save)
    # Ten learn/save rounds; each save overwrites the previous checkpoint.
    for _ in range(10):
        self.model.learn(n_timesteps)
        self.model.save(save)
def load(path, env, eval_env, env_name, seed, n_procs, num_steps):
    """
    Load a model from a folder (overrides base method)

    :param env: (Environment) the environment / environment ID
    :param eval_env: (Environment) the environment to evaluate models on
    :param env_name: (str) the name used for saving models
    :param seed: (int) seed for randomization
    :param n_procs: (int) the number of processes used in training
    :param num_steps: (int) the number of steps to train / retrain for
    """
    env = VecNormalize.load(path + ".env", env)
    model = ALGO.load(path, env=env, verbose=1, tensorboard_log="./tensorboard/",
                      seed=seed, nminibatches=NMINIBATCHES)
    # Re-brand the loaded object as our Model subclass and restore bookkeeping.
    model.__class__ = Model
    model.n_procs = n_procs
    model.num_steps = num_steps
    model.seed = seed
    model.eval_env = eval_env
    model.loaded = True
    model.env_name = env_name
    return model
def create_env(n_envs, eval_env=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether is it an environment used for evaluation or not
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    # Do not log the eval env (avoids writing to the same monitor file).
    log_dir = None if eval_env else save_path
    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif algo_ in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        # On most envs SubprocVecEnv does not help and is memory hungry,
        # so a DummyVecEnv is used regardless of n_envs.
        env_fns = [make_env(env_id, rank, args.seed, log_dir=log_dir,
                            wrapper_class=env_wrapper)
                   for rank in range(n_envs)]
        env = DummyVecEnv(env_fns)
        if normalize:
            if args.verbose > 0:
                if len(normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **normalize_kwargs)
    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
        del hyperparams['frame_stack']
    return env
def render():
    """Roll out a trained PPO2 model on a single visualized CubeEnv episode."""
    initializer = RandomInitializer(difficulty=1)

    def build_vec_env(num_of_envs):
        # Factory of seeded, flattened CubeEnv instances wrapped in a DummyVecEnv.
        def make_single(rank):
            def _init():
                cube_env = CubeEnv(
                    frameskip=5,
                    visualization=True,
                    initializer=initializer,
                    action_type=ActionType.POSITION,
                    observation_type=ObservationType.WITHOUT_GOALS)
                cube_env.seed(seed=rank)
                cube_env.action_space.seed(seed=rank)
                return FlatObservationWrapper(cube_env)
            return _init
        return DummyVecEnv([make_single(rank=i) for i in range(num_of_envs)])

    render_env = VecNormalize.load("models/PPO_09_14_2020_19_06_26.pkl",
                                   build_vec_env(1))
    model = PPO2.load("models/checkpoint_saves/rl_model_10000000_steps",
                      env=render_env)
    obs = model.env.reset()
    is_done = False
    while not is_done:
        action, _ = model.predict(obs)
        obs, rew, is_done, info = render_env.step(action)
    print("Reward at final step: {:.3f}".format(rew))
def create_env(n_envs, env_name=None, log_dir=None):
    """Build a monitored vec env for env_name, with reward-only normalization."""
    base_env = make_vec_env(ENVS[env_name][env_id],
                            n_envs=n_envs,
                            env_kwargs=ENVS[env_name][env_kwargs],
                            monitor_dir=log_dir)
    return VecNormalize(base_env, norm_obs=False, norm_reward=True)
def main(cfg, run_dir):
    """Configure output/logging, build the Atari env and train the attention model."""
    run_name = make_run_name(cfg)
    output_dir = run_dir / run_name
    output_dir.mkdir(parents=True)
    with (output_dir / 'config.json').open('w') as fp:
        json.dump(cfg, fp, indent=2)
    # Setting log levels to cut out minor errors
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)
    log_dir = output_dir / cfg['log_dir']
    tensorboard_dir = output_dir / cfg['tb_dir']
    configure(log_dir=str(log_dir),
              format_strs=['log', 'csv', 'tensorboard'],
              tensorboard_dir=str(tensorboard_dir))
    # Create and wrap the environment
    logging.info('Starting {env_name}'.format(**cfg))
    env = make_atari_env(env_id=cfg['env_name'], num_env=8, seed=cfg['train_seed'])
    env = VecFrameStack(env, n_stack=4)
    if cfg['normalize']:
        env = VecNormalize(env)
    # Setting all known random seeds (Python, Numpy, TF, Gym if available)
    set_global_seeds(cfg['train_seed'])
    logging.info('Running {algo}'.format(**cfg))
    algo = get_algo(cfg['algo'])
    feature_extractor = get_network_builder(cfg['network'])
    attn_loss = get_loss(cfg['attn_loss'])()
    model = algo(
        policy=cfg['policy_type'],
        env=env,
        verbose=1,
        learning_rate=lambda frac: 0.00025 * frac,
        attn_loss=attn_loss,
        attn_coef=cfg['attn_coef'],
        policy_kwargs={'cnn_extractor': feature_extractor},
        tensorboard_log=str(tensorboard_dir),
    )
    logging.info('Training for {time_steps} steps'.format(**cfg))
    model.learn(
        total_timesteps=cfg['time_steps'],
        log_interval=cfg['log_interval'],
        tb_log_name=None,
        callback=Callback(output_dir),
    )
def gen_pre_train(self, num_e=1, save='default2', episodes=1000):
    """Load a pretrained PPO2 model on a normalized vec env.

    NOTE(review): expert trajectory generation itself is currently disabled
    (the generate_expert_traj call was commented out in the original).
    """
    env_id = 'default'
    num_e = 1
    self.env = SubprocVecEnv([self.make_env(env_id, rank) for rank in range(num_e)])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    self.env.load_running_average("saves")
    self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy,
                           tensorboard_log="./default/")
    self.env.load_running_average("saves")
    # self.expert_agent = generate_expert_traj(self.model, save, self.env, n_episodes=episodes)
def test_model_manipulation(model_class, goal_selection_strategy):
    """Check HER save/load and env-manipulation round-trips for model_class."""
    continuous = model_class in [DDPG, SAC]
    bit_env = BitFlippingEnv(N_BITS, continuous=continuous, max_steps=N_BITS)
    env = DummyVecEnv([lambda: bit_env])
    model = HER('MlpPolicy', env, model_class, n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy, verbose=0)
    model.learn(1000)
    model_predict(model, env, n_steps=100, additional_check=None)
    model.save('./test_her')
    del model
    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))
    model = HER.load('./test_her')
    # Without a wrapped env (or an env passed to the model), predict must fail.
    with pytest.raises(ValueError):
        model.predict(env.reset())
    wrapped_env = HERGoalEnvWrapper(
        BitFlippingEnv(N_BITS, continuous=continuous, max_steps=N_BITS))
    model_predict(model, wrapped_env, n_steps=100, additional_check=None)
    model.set_env(env)
    model.learn(1000)
    model_predict(model, wrapped_env, n_steps=100, additional_check=None)
    assert model.n_sampled_goal == 3
    del model
    env = BitFlippingEnv(N_BITS, continuous=continuous, max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)
    model_predict(model, wrapped_env, n_steps=100, additional_check=None)
    assert model.n_sampled_goal == 3
    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
def f(path: str, env: gym.Env) -> BasePolicy:
    """Loads a policy saved to path, for environment env."""
    tf.logging.info(f"Loading Stable Baselines policy for '{cls}' "
                    f"from '{path}'")
    model = cls.load(os.path.join(path, 'model.pkl'), env=env)
    policy = getattr(model, policy_attr)
    try:
        vec_normalize = VecNormalize(env, training=False)
        vec_normalize.load_running_average(path)
        policy = NormalizePolicy(policy, vec_normalize)
        tf.logging.info(f"Loaded normalization statistics from '{path}'")
    except FileNotFoundError:
        pass  # VecNormalize was not used during training; keep the raw policy.
    return policy
def create_env(args, env_id, godot_instances, params, session_path, eval=False):
    """
    Create a (normalized, optionally frame-stacked) SubprocVecEnv over Godot instances.

    :param args: parsed CLI args (uses n_agents_per_env and n_stack)
    :param env_id: the Godot environment id
    :param godot_instances: iterable of (obs_port, action_port) pairs
    :param params: run parameters used to locate the VecNormalize file
    :param session_path: session path forwarded to each env
    :param eval: (bool) if True, create only one agent per env
    :return: (VecEnv)
    """
    n = 1 if eval else args.n_agents_per_env
    env = SubprocVecEnv([make_godot_env(env_id, f'{obs_port}_{i}', obs_port,
                                        action_port, args, session_path, eval,
                                        seed=obs_port * i)
                         for i in range(n)
                         for obs_port, action_port in godot_instances])
    vecnorm_path = get_vec_normalize_filepath(params, args)
    if vecnorm_path.exists():
        print(f'found vecnormalize data file @ {vecnorm_path.absolute()}. loading existing file.')
        env = VecNormalize.load(vecnorm_path, env)
    else:
        # BUG FIX: the original f-string was missing the braces around the
        # path expression and printed the literal text 'vecnorm_path.absolute()'.
        print(f'unable to find existing vecnormalize data file @ {vecnorm_path.absolute()}. creating a new one.')
        env = VecNormalize(env, norm_obs=True, norm_reward=True,
                           clip_obs=1.0, clip_reward=100.0)
    if args.n_stack > 1:
        env = VecFrameStack(env, n_stack=args.n_stack)
    return env
def run():
    """
    The main function of the agent
    Parses argv and executes accordingly
    """
    first_arg = sys.argv[1] if len(sys.argv) > 1 else ""
    visualize = first_arg == "v"
    resume = first_arg == "r"
    evaluate = visualize or first_arg == "e"
    loadpath = sys.argv[2] if resume or evaluate else ""
    print("Setting up env")
    env = SubprocVecEnv([make_env(ENV, i) for i in range(N_PROCS)],
                        start_method='spawn')
    eval_env = DummyVecEnv([make_env(ENV, i) for i in range(N_PROCS)])
    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=False, clip_obs=10.)
    print("Setting up model")
    if resume or evaluate:
        # Resuming/evaluating: the saved model carries its own normalization.
        model = Model.load(loadpath, env, eval_env=eval_env, env_name=ENV_NAME,
                           seed=SEED, n_procs=N_PROCS, num_steps=NUM_STEPS)
    else:
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
        model = Model(env=env, eval_env=eval_env, env_name=ENV_NAME, seed=SEED,
                      n_procs=N_PROCS, num_steps=NUM_STEPS)
    if evaluate:
        model.evaluate(visualize)
    else:
        model.trainAndSave()
def main():
    """Parse CLI arguments, build the normalized RCRS env and run the model."""
    parser = argparse.ArgumentParser()
    # (name, help text, type) for every positional argument.
    arg_specs = [
        ("algorithm", 'Which algorithm are you using', str),
        ("training_timesteps", "How many traning steps are there?", int),
        ("testing_timesteps", "How many testing steps are there?", int),
        ("training_iterations", "How many traning iterations are there?", int),
        ("testing_iterations", "How many traning iterations are there?", int),
        ("learning_rate", "What is the learning rate?", float),
        ("batch_size", "What is the batch size?", int),
        ("building_port", "What is the building_port?", int),
        ("reward_port", "What is the reward_port?", int),
        ("agent_port", "What is the agent_port?", int),
    ]
    for name, help_text, arg_type in arg_specs:
        parser.add_argument(name, help=help_text, type=arg_type)
    args = parser.parse_args()
    all_ports = [args.building_port, args.reward_port, args.agent_port]
    pd.DataFrame(all_ports).to_csv('allports.csv', index=False)
    hostname = socket.gethostname()
    # Path
    path = os.path.join(sys.path[0], hostname)
    # os.mkdir(path)
    path_for_kill_file = os.path.join(sys.path[0], "kill.sh")
    env = gym.make('RCRS-v2')
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    # Automatically normalize the input features
    env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
    run_model(args.algorithm, args.training_timesteps, args.testing_timesteps,
              args.training_iterations, args.testing_iterations,
              args.learning_rate, args.batch_size, env=env, hostname=hostname,
              path_for_kill_file=path_for_kill_file)
def test_sync_vec_normalize():
    """VecNormalize statistics can be synced from a train env to an eval env."""
    train_env = DummyVecEnv([make_env])
    assert unwrap_vec_normalize(train_env) is None
    train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True,
                             clip_obs=10., clip_reward=10.)
    assert isinstance(unwrap_vec_normalize(train_env), VecNormalize)
    train_env = VecFrameStack(train_env, 1)
    assert isinstance(unwrap_vec_normalize(train_env), VecNormalize)
    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True,
                            norm_reward=True, clip_obs=10., clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)
    train_env.reset()
    # Initialize the running mean with random steps.
    for _ in range(100):
        train_env.step([train_env.action_space.sample()])
    obs = train_env.reset()
    original_obs = train_env.get_original_obs()
    dummy_rewards = np.random.rand(10)
    # Before syncing, normalization must differ.
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))
    sync_envs_normalization(train_env, eval_env)
    # After syncing, both envs must normalize identically.
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(train_env.normalize_reward(dummy_rewards),
                       eval_env.normalize_reward(dummy_rewards))
def run(args):
    """Load the config and trained agent referenced by args.model and run it."""
    top_folder_str = args.model[:args.model.rfind('/')]
    config = io_utils.load_yaml(top_folder_str + '/config.yaml')
    normalize = config.get("normalize", False)
    if args.visualize:
        config['simulation']['real_time'] = False
        config['simulation']['visualize'] = True
    task = DummyVecEnv([
        lambda: gym.make('gripper-env-v0', config=config, evaluate=True,
                         test=args.test)
    ])
    if normalize:
        task = VecNormalize(task, training=False, norm_obs=False,
                            norm_reward=True, clip_obs=10.)
        task = VecNormalize.load(
            os.path.join(top_folder_str, 'vecnormalize.pkl'), task)
    model_lower = args.model.lower()
    # Dispatch table replaces the original if/elif chain; algorithm choice
    # comes from the config, not the model filename.
    algo_classes = {
        'trpo': sb.TRPO,
        'sac': sb.SAC,
        'ppo': sb.PPO2,
        'dqn': sb.DQN,
        'bdq': sb.BDQ,
    }
    algo_cls = algo_classes.get(config["algorithm"])
    if algo_cls is None:
        raise Exception
    agent = algo_cls.load(args.model)
    print("Run the agent")
    run_agent(task, agent, args.stochastic)
    task.close()
def _save_normalization_artifacts(self) -> None:
    """Persist the training env's VecNormalize statistics and reload them
    into the eval env.

    Only runs when the eval env is wrapped in VecNormalize and this is a
    fresh run (not continue_learning).
    """
    # if normalize is active
    if isinstance(self.eval_env, VecNormalize) and not self.continue_learning:
        path = os.path.join(self.log_dir, "vecnormalize.pkl")
        # Save the running statistics from the (normalized) training env.
        if self.model.get_vec_normalize_env() is not None:
            self.model.get_vec_normalize_env().save(path)
            if self.verbose > 1:
                print("Saving VecNormalize to {}".format(path))
        # don't know why but rewards are still normalized
        # NOTE(review): this rebuilds the eval wrapper from the freshly saved
        # stats; presumably `unwrapped` yields the underlying vec env here —
        # confirm against the VecEnvWrapper API.
        self.eval_env = VecNormalize.load(os.path.join(self.log_dir, "vecnormalize.pkl"), self.eval_env.unwrapped)
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
    venv.reset()
    venv.get_original_obs()
    # Take 100 random steps so the running statistics are initialized.
    for _ in range(100):
        venv.step([venv.action_space.sample()])
    return venv
def create_env(env_name, normalized, Training=False):
    """
    Create a gym env, optionally wrapped with saved VecNormalize statistics.

    :param env_name: (str) gym environment id
    :param normalized: (bool) load saved normalization stats for this env
    :param Training: (bool) whether the VecNormalize wrapper keeps updating stats
    :return: the normalized vec env when `normalized`, else the raw gym env
    """
    env = gym.make(env_name)
    if normalized:
        from stable_baselines.common.vec_env import VecNormalize, DummyVecEnv
        vec_env = DummyVecEnv([lambda: env])
        vec_env = VecNormalize.load('data/models/env_stats/' + env_name + '.pkl',
                                    venv=vec_env)
        vec_env.training = Training
        vec_env.reward_range = env.reward_range
        # BUG FIX: return the normalized wrapper; the original discarded it and
        # returned the raw env even when normalization was requested.
        return vec_env
    return env
def load_train_env(num_envs, robot_radius, rew_fnc, num_stacks, stack_offset,
                   debug, task_mode, policy, disc_action_space, normalize):
    """Build the training SubprocVecEnv, optionally normalized and frame-stacked."""
    # Choose the env wrapper class matching the policy family and action space.
    if policy in ("CnnPolicy", "CnnLnLstmPolicy", "CnnLstmPolicy"):
        env_temp = RosEnvDiscImg if disc_action_space else RosEnvContImg
    elif policy == "CNN1DPolicy":
        env_temp = RosEnvDiscRawScanPrepWp if disc_action_space else RosEnvContRawScanPrepWp
    elif policy == "CNN1DPolicy_multi_input":
        env_temp = RosEnvDiscRaw if disc_action_space else RosEnvContRaw
    elif policy in ("CnnPolicy_multi_input_vel", "CnnPolicy_multi_input_vel2"):
        env_temp = RosEnvDiscImgVel if disc_action_space else RosEnvContImgVel

    env = SubprocVecEnv([
        lambda k=k: Monitor(
            env_temp("sim%d" % (k + 1),
                     StateCollector("sim%s" % (k + 1), "train"),
                     stack_offset, num_stacks, robot_radius, rew_fnc, debug,
                     "train", task_mode),
            '%s/%s/sim_%d' % (path_to_models, agent_name, k + 1),
            allow_early_resets=True)
        for k in range(num_envs)
    ])

    # Normalizing?
    if normalize:
        env = VecNormalize(env, training=True, norm_obs=True, norm_reward=False,
                           clip_obs=100.0, clip_reward=10.0, gamma=0.99,
                           epsilon=1e-08)

    # Stack of data?
    if num_stacks > 1:
        env = VecFrameStack(env, n_stack=num_stacks, n_offset=stack_offset)

    return env
def run_ppo_policies(easy, main_dir, n_exps):
    """Run the saved PPO policies exp-1..exp-n and collect their visited states."""
    env = VecNormalize(
        DummyVecEnv([create_env_fn(0, monitored=False, easy=easy)]),
        gamma=0.999, training=False)
    return [
        np.array(run_ppo_policy(env, os.path.join(main_dir, "exp-" + str(idx))))
        for idx in range(1, n_exps + 1)
    ]
def train(self, num_e=1, n_timesteps=10000000, save_fraction=0.1, save='saves/m1'):
    """Train a SAC agent on the template env, checkpointing every save_fraction."""
    env_id = "default"
    num_e = 32  # Number of processes (unused: a single DummyVecEnv is built below)
    env = Template_Gym()
    self.env = DummyVecEnv([lambda: env])
    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
    self.model = SAC(CustomPolicy_sac, self.env, verbose=1, learning_rate=1e-5,
                     tensorboard_log="./m1lstm1")
    steps_per_chunk = int(n_timesteps * save_fraction)
    num_chunks = int(1 / save_fraction)
    # Learn in chunks; each chunk writes a numbered checkpoint.
    for chunk in range(num_chunks):
        self.model.learn(steps_per_chunk)
        self.model.save(save + str(chunk))
def make_env(env_id, env_args, seed, is_train, with_vecnorm):
    """Create (train_env, eval_env) for training, or (raw gym env, None) otherwise."""
    monitor_dir = os.path.join(env_args['log_file'], 'log')

    if not is_train:
        # Plain env for testing; no eval companion.
        return gym.make(env_id, **env_args), None

    # env for training
    env = make_vec_env(env_id=lambda: gym.make(env_id, **env_args),
                       seed=seed, monitor_dir=monitor_dir, n_envs=1)
    if with_vecnorm:
        env = VecNormalize(env, norm_obs=True, norm_reward=True,
                           clip_obs=10., clip_reward=10.)

    # env for evaluation during training
    env_args['renders'] = False
    if 'dset' in env_args:
        env_args['dset'] = 'eval'
    eval_env = make_vec_env(env_id=lambda: gym.make(env_id, **env_args),
                            seed=seed + 1, monitor_dir=monitor_dir + '/eval',
                            n_envs=1)
    if with_vecnorm:
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True,
                                clip_obs=10., clip_reward=10.)
    return env, eval_env
def create_env(n_envs):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :return: (gym.Env)
    """
    global hyperparams
    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif args.algo in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        # No env_wrapper applied for now as not using make_env()
        env = gym.make(env_id)
        env.seed(args.seed)
    else:
        # On most envs SubprocVecEnv does not help and is memory hungry,
        # so a DummyVecEnv is used regardless of n_envs.
        env_fns = [make_env(env_id, rank, args.seed, wrapper_class=env_wrapper)
                   for rank in range(n_envs)]
        env = DummyVecEnv(env_fns)
        if normalize:
            if args.verbose > 0:
                print("Normalizing input and return")
            env = VecNormalize(env, **normalize_kwargs)
    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
        del hyperparams['frame_stack']
    return env
def vecEnv(env_kwargs_local, env_class):
    """
    Local Env Wrapper

    :param env_kwargs_local: arguments related to the environment wrapper
    :param env_class: class of the env
    :return: env for the pretrained algo
    """
    kwargs = dict(env_kwargs_local)
    # Pretraining rollouts never record data or render.
    kwargs.update(record_data=False, renders=False)
    inner_env = env_class(**kwargs)
    return VecNormalize(DummyVecEnv([lambda: inner_env]),
                        norm_obs=True, norm_reward=False)
def create_env(env_name, config=None, n_workers=8, image_based=True, **kwargs):
    """
    Parses the environment to correctly return the attributes based on the spec and type
    Creates a corresponding vectorized environment

    :param env_name: name of a gym env or of a custom rl.environments class
    :param config: optional dict with a 'main' section (n_workers, frame_stack, normalize)
    :param n_workers: number of parallel workers when no config is given
    :param image_based: unused here; kept for interface compatibility
    :return: (VecEnv)
    """

    def make_rl(**kwargs):
        """ Decorator for custom RL environments """
        def _init():
            env_obj = getattr(rl.environments, env_name)
            return env_obj(config)
        return _init

    def make_gym(rank, seed=0, **kwargs):
        """ Decorator for gym environments """
        def _init():
            env = gym.make(env_name)
            env.seed(seed + rank)
            return env
        return _init

    if config is not None:
        n_workers = config['main']['n_workers']
    mapping = {'gym': make_gym, 'rl': make_rl}
    env_decorator = mapping[get_env_type(env_name)]
    env_fns = [env_decorator(rank=x) for x in range(n_workers)]
    # Parallelize only when more than one worker is requested.
    if n_workers > 1:
        method = 'spawn' if sys.platform == 'win32' else 'forkserver'
        vectorized = SubprocVecEnv(env_fns, start_method=method)
    else:
        vectorized = DummyVecEnv(env_fns)
    # BUG FIX: the original dereferenced config['main'] unconditionally below,
    # crashing with a TypeError for the documented default config=None.
    main_cfg = config['main'] if config is not None else {}
    # Frame-stacking for CNN based environments
    if main_cfg.get('frame_stack', 0):
        vectorized = VecFrameStack(vectorized, n_stack=main_cfg['frame_stack'])
    if 'normalize' in main_cfg:
        vectorized = VecNormalize(vectorized, clip_obs=1, clip_reward=1)
    return vectorized