Example #1
    def evaluate(self, num_env=1, num_steps=21900, load="saves/aud5", runs=10):
        """
        Evaluate a RL agent
        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        env_id = 'default'
        num_e = 1
        log_dir = "saves"
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_env)])
        #self.model = PPO2(CustomPolicy, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./default" )
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        self.env.load_running_average(log_dir)
        for i in range(runs):
            self.model = PPO2.load(load+str(i), self.env, policy=CustomPolicy_4,  tensorboard_log="./default/" )
            self.env.load_running_average(log_dir)
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            #self.total_pips = []
            obs = self.env.reset()
            state = None
            # When using VecEnv, done is a vector
            done = [False for _ in range(self.env.num_envs)]
            for step in range(num_steps):
                # _states are only useful when using LSTM policies
                action, state = self.model.predict(obs, state=state, mask=done, deterministic=True)
                # action, rewards and dones are arrays because we are using a vectorized env
                obs, rewards, dones, _ = self.env.step(action)
                #self.total_pips.append(self.env.player.placement)

                # Stats: accumulate reward per env, start a new episode entry when done
                for n in range(self.env.num_envs):
                    episode_rewards[n][-1] += rewards[n]
                    if dones[n]:
                        episode_rewards[n].append(0.0)

            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for n in range(self.env.num_envs):
                mean_rewards[n] = np.mean(episode_rewards[n])
                n_episodes += len(episode_rewards[n])

        # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward
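
Note: the example above restores normalization statistics with VecNormalize.load_running_average(), an older stable-baselines API that was later deprecated in favour of VecNormalize.save() / VecNormalize.load(). A minimal sketch of the newer pattern, assuming stable-baselines >= 2.9 (the file name is a placeholder):

import gym
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

venv = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
# ... train here so the running mean/std get updated ...
venv.save("vec_normalize.pkl")  # persist observation/reward statistics

eval_venv = VecNormalize.load("vec_normalize.pkl", DummyVecEnv([lambda: gym.make("CartPole-v1")]))
eval_venv.training = False      # freeze the statistics during evaluation
eval_venv.norm_reward = False   # report unnormalized rewards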
Example #2
    def pre_train(self, num_e=1, load="saves/m19"):
        env_id = 'default'
        num_e = 1
        log_dir = "saves"
        # Using only one expert trajectory
        # you can specify `traj_limitation=-1` for using the whole dataset
        dataset = ExpertDataset(expert_path='default2.npz',
                                traj_limitation=1,
                                batch_size=128)
        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #env = make_env()
        #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
        self.env.load_running_average("saves")
        self.model = PPO2(CustomPolicy,
                          self.env,
                          verbose=1,
                          nminibatches=1,
                          learning_rate=1e-5,
                          tensorboard_log="./m1ln4")
        #self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/" )
        self.env.load_running_average("saves")
        # Pretrain the PPO2 model
        self.model.pretrain(dataset, n_epochs=10000)

        # As an option, you can train the RL agent
        #self.model.learn(int(100000000))

        # Test the pre-trained model
        self.env = self.model.get_env()
        self.env.load_running_average("saves")
        obs = self.env.reset()

        reward_sum = 0.0
        for _ in range(1000000):
            action, _ = self.model.predict(obs)
            obs, reward, done, _ = self.env.step(action)
            reward_sum += reward
            #self.env.render()
            if done:
                print(reward_sum)
                reward_sum = 0.0
                obs = self.env.reset()

        self.env.close()
Example #3
def createEnvs(args,
               allow_early_resets=False,
               env_kwargs=None,
               load_path_normalise=None):
    """
    :param args: (argparse.Namespace Object)
    :param allow_early_resets: (bool) Allow reset before the environment is done, usually used in ES to halt the envs
    :param env_kwargs: (dict) The extra arguments for the environment
    :param load_path_normalise: (str) the path to loading the rolling average, None if not available or wanted.
    :return: (Gym VecEnv)
    """
    # imported here to prevent cyclic imports

    envs = [
        makeEnv(args.env,
                args.seed,
                i,
                args.log_dir,
                allow_early_resets=allow_early_resets,
                env_kwargs=env_kwargs) for i in range(args.num_cpu)
    ]

    if len(envs) == 1:
        # No need for subprocesses when having only one env
        envs = DummyVecEnv(envs)
    else:
        envs = SubprocVecEnv(envs)

    envs = VecFrameStack(envs, args.num_stack)

    envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
    # envs = loadRunningAverage(envs, load_path_normalise=load_path_normalise)

    return envs
Example #4
    def __init__(self, **params):
        super().__init__(**params)
        self.Model = PPO2
        self.solver_signature = "gym_" + ParameterManager.get_param_footprint(self.get_footprint_params())

        # parameters from our config, not the original one
        self.days = self.params['dataset']["days"]
        env_id = "TaxiEnv-v01"
        self.env_params = self.load_env_params()

        seed = np.random.randint(1,10000)
        self.log['seed'] = seed

        if self.params.get("lstm", 0) == 1:
            Policy = MlpLstmPolicy
            nminibatches = 1
            num_cpu = 1 # One current limitation of recurrent policies is that you must test them with the same number of environments they have been trained on.
        else:
            Policy = MlpPolicy
            nminibatches = 4
            num_cpu = self.params['num_cpu']
        # Create the vectorized environment
        self.train_env = SubprocVecEnv([self.make_env(env_id, i, seed+i, self.env_params) for i in range(num_cpu)])

        self.train_env = VecNormalize(self.train_env, norm_obs=False, norm_reward=False)

        # minibatches matter; recurrent policies additionally require nminibatches=1 (no parallelism)
        self.model = self.Model(Policy,
                                self.train_env,
                                verbose=0,
                                nminibatches=nminibatches,
                                tensorboard_log=os.path.join(self.dpath, self.solver_signature),
                                n_steps=self.params['dataset']['time_periods'] + 1)
Example #5
 def train(self,
           num_e=1,
           n_timesteps=1000000,
           save_fraction=0.0125,
           save='saves/audbuyh4120',
           config=config):
     env_id = "default"
     num_e = 1  # Number of processes to use
     # Create the vectorized environment
     #env = DummyVecEnv([lambda: env])
     #Ramona
     self.config = config
     self.env = SubprocVecEnv([
         self.make_env(env_id, i, eval=False, config=self.config)
         for i in range(num_e)
     ])
     #env = Template_Gym()
     #self.env = DummyVecEnv([lambda: env])
     self.env = VecNormalize(self.env, norm_obs=False, norm_reward=True)
     self.model = PPO2(CnnPolicy, self.env, verbose=0)
     #self.model = PPO2("MlpPolicy", self.env, verbose=0, nminibatches=1, tensorboard_log="./aud_chf", learning_rate=1e-5  )
     #self.model = PPO2(CustomPolicy_4, self.env, verbose=0, nminibatches=1, tensorboard_log="./gbp_chf_4h_r", **self.config.params )
     #self.model = PPO2(CustomPolicy_5, self.env, verbose=0, nminibatches=1, tensorboard_log="./aud_chf", learning_rate=1e-5  )#**self.config.params
     #self.model = PPO2.load('saves/playerdetails39', self.env, policy=CustomPolicy,  tensorboard_log="./playerdetailsex" )
     #self.model = PPO2.load(self.config.path+str(79)+'.pkl', self.env, policy=CustomPolicy_5,  tensorboard_log="./default/" )
     #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/" )
     n_timesteps = n_timesteps * save_fraction
     n_timesteps = int(n_timesteps)
     training_loop = 1 / save_fraction
     training_loop = int(training_loop)
     log_dir = "saves"
     #self.env.load_running_average(log_dir)
     for i in range(training_loop):
         self.model.learn(n_timesteps)
         self.model.save(self.config.save + str(i))
Example #6
def create_env(n_envs, eval_env=False, no_log=False):

    global hyperparams, env_kwargs
    log_dir = None if eval_env or no_log else save_path

    if n_envs == 1:
        env = DummyVecEnv([
            make_env(env_id,
                     0,
                     seed,
                     wrapper_class=env_wrapper,
                     log_dir=log_dir,
                     env_kwargs=env_kwargs)
        ])
    else:
        env = DummyVecEnv([
            make_env(env_id,
                     i,
                     seed,
                     wrapper_class=env_wrapper,
                     log_dir=log_dir,
                     env_kwargs=env_kwargs) for i in range(n_envs)
        ])

    if normalize:
        local_normalize_kwargs = {'norm_reward': False}
        env = VecNormalize(env, **local_normalize_kwargs)

    return env
Example #7
    def train(self, num_e=1, n_timesteps=1000000, save='saves/agent4'):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])

        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_e)])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #self.model = PPO2(policy=CnnPolicy,
        #env=SubprocVecEnv(self.env_fns),
        #n_steps=8192,
        #nminibatches=8,
        #lam=0.95,
        #gamma=0.99,
        #noptepochs=4,
        #ent_coef=0.001,
        #learning_rate=lambda _: 2e-5,
        #cliprange=lambda _: 0.2,
        #verbose=1,
        #tensorboard_log="./breakorbust")
        self.model = PPO2(CustomPolicy,
                          env=self.env,
                          verbose=0,
                          learning_rate=1e-5,
                          tensorboard_log=save)
        for i in range(10):
            self.model.learn(n_timesteps)
            self.model.save(save)
Example #8
 def load(path, env, eval_env, env_name, seed, n_procs, num_steps):
     """
     Load a model from a folder (overrides base method)
     :param env: (Environment) the environment / environment ID
     :param eval_env: (Environment) the environment to evaluate models on
     :param env_name: (str) the name used for saving models
     :param seed: (int) seed for randomization
     :param n_procs: (int) the number of processes used in training
     :param num_steps: (int) the number of steps to train / retrain for
     """
     env = VecNormalize.load(path + ".env", env)
     ret = ALGO.load(path,
                     env=env,
                     verbose=1,
                     tensorboard_log="./tensorboard/",
                     seed=seed,
                     nminibatches=NMINIBATCHES)
     ret.__class__ = Model
     ret.n_procs = n_procs
     ret.num_steps = num_steps
     ret.seed = seed
     ret.eval_env = eval_env
     ret.loaded = True
     ret.env_name = env_name
     return ret
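
The loader above expects two artifacts saved side by side: the VecNormalize statistics at path + ".env" and the model weights at path. A hypothetical counterpart for the saving side (ALGO/Model come from the surrounding project; the model is assumed to have been trained on a VecNormalize-wrapped env):

def save(model, path):
    # model.get_env() returns the VecNormalize-wrapped training env
    model.get_env().save(path + ".env")  # running mean/std for observations and rewards
    model.save(path)                     # algorithm weights and hyperparameters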
Example #9
    def create_env(n_envs, eval_env=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether it is an environment used for evaluation or not
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env else save_path

        if is_atari:
            if args.verbose > 0:
                print("Using Atari wrapper")
            env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
            # Frame-stacking with 4 frames
            env = VecFrameStack(env, n_stack=4)
        elif algo_ in ['dqn', 'ddpg']:
            if hyperparams.get('normalize', False):
                print("WARNING: normalization not supported yet for DDPG/DQN")
            env = gym.make(env_id)
            env.seed(args.seed)
            if env_wrapper is not None:
                env = env_wrapper(env)
        else:
            if n_envs == 1:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             wrapper_class=env_wrapper,
                             log_dir=log_dir)
                ])
            else:
                # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                # On most env, SubprocVecEnv does not help and is quite memory hungry
                env = DummyVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=log_dir,
                             wrapper_class=env_wrapper) for i in range(n_envs)
                ])
            if normalize:
                if args.verbose > 0:
                    if len(normalize_kwargs) > 0:
                        print("Normalization activated: {}".format(
                            normalize_kwargs))
                    else:
                        print("Normalizing input and reward")
                env = VecNormalize(env, **normalize_kwargs)
        # Optional Frame-stacking
        if hyperparams.get('frame_stack', False):
            n_stack = hyperparams['frame_stack']
            env = VecFrameStack(env, n_stack)
            print("Stacking {} frames".format(n_stack))
            del hyperparams['frame_stack']
        return env
Example #10
def render():
    initializer = RandomInitializer(difficulty=1)

    def get_multi_process_env(num_of_envs):
        def _make_env(rank):
            def _init():
                out_env = CubeEnv(
                    frameskip=5,
                    visualization=True,
                    initializer=initializer,
                    action_type=ActionType.POSITION,
                    observation_type=ObservationType.WITHOUT_GOALS)
                out_env.seed(seed=rank)
                out_env.action_space.seed(seed=rank)
                out_env = FlatObservationWrapper(out_env)
                return out_env

            return _init

        return DummyVecEnv([_make_env(rank=i) for i in range(num_of_envs)])

    render_env = VecNormalize.load("models/PPO_09_14_2020_19_06_26.pkl",
                                   get_multi_process_env(1))

    model = PPO2.load("models/checkpoint_saves/rl_model_10000000_steps",
                      env=render_env)

    obs = model.env.reset()
    is_done = False
    while not is_done:
        action, _ = model.predict(obs)
        obs, rew, is_done, info = render_env.step(action)

    print("Reward at final step: {:.3f}".format(rew))
Example #11
def create_env(n_envs, env_name=None, log_dir=None):
    return VecNormalize(make_vec_env(ENVS[env_name][env_id],
                                     n_envs=n_envs,
                                     env_kwargs=ENVS[env_name][env_kwargs],
                                     monitor_dir=log_dir),
                        norm_obs=False,
                        norm_reward=True)
Example #12
def main(cfg, run_dir):
    run_name = make_run_name(cfg)
    output_dir = run_dir / run_name
    output_dir.mkdir(parents=True)

    with (output_dir / 'config.json').open('w') as fp:
        json.dump(cfg, fp, indent=2)

    # Setting log levels to cut out minor errors
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)

    log_dir = output_dir / cfg['log_dir']
    tensorboard_dir = output_dir / cfg['tb_dir']

    configure(log_dir=str(log_dir),
              format_strs=['log', 'csv', 'tensorboard'],
              tensorboard_dir=str(tensorboard_dir))

    # Create and wrap the environment
    logging.info('Starting {env_name}'.format(**cfg))
    env = make_atari_env(env_id=cfg['env_name'],
                         num_env=8,
                         seed=cfg['train_seed'])
    env = VecFrameStack(env, n_stack=4)
    if cfg['normalize']:
        env = VecNormalize(env)

    # Setting all known random seeds (Python, Numpy, TF, Gym if available)
    set_global_seeds(cfg['train_seed'])

    logging.info('Running {algo}'.format(**cfg))

    algo = get_algo(cfg['algo'])
    policy = cfg['policy_type']
    feature_extractor = get_network_builder(cfg['network'])
    attn_loss = get_loss(cfg['attn_loss'])()
    model = algo(
        policy=policy,
        env=env,
        verbose=1,
        learning_rate=lambda frac: 0.00025 * frac,
        attn_loss=attn_loss,
        attn_coef=cfg['attn_coef'],
        policy_kwargs={
            'cnn_extractor': feature_extractor,
        },
        tensorboard_log=str(tensorboard_dir),
    )

    logging.info('Training for {time_steps} steps'.format(**cfg))

    # Training
    model.learn(
        total_timesteps=cfg['time_steps'],
        log_interval=cfg['log_interval'],
        tb_log_name=None,
        callback=Callback(output_dir),
    )
Example #13
 def gen_pre_train(self, num_e=1, save='default2', episodes=1000):
     #self.create_envs(game_name=game, state_name=state, num_env=num_e)
     #self.env=SubprocVecEnv(self.env_fns)
     env_id = 'default'
     num_e = 1
     self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
     #env = Template_Gym()
     #self.env = DummyVecEnv([lambda: env])
     self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
     #env = make_env()
     #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
     self.env.load_running_average("saves")
     self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/" )
     self.env.load_running_average("saves")
     #env = make_env()
     #self.expert_agent = 
     generate_expert_traj(self.model, save, self.env, n_episodes=episodes)
Example #14
def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS,
                         continuous=model_class in [DDPG, SAC],
                         max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy',
                env,
                model_class,
                n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))

    model = HER.load('./test_her')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS,
                          continuous=model_class in [DDPG, SAC],
                          max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS,
                         continuous=model_class in [DDPG, SAC],
                         max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
Example #15
    def f(path: str, env: gym.Env) -> BasePolicy:
        """Loads a policy saved to path, for environment env."""
        tf.logging.info(f"Loading Stable Baselines policy for '{cls}' "
                        f"from '{path}'")
        model_path = os.path.join(path, 'model.pkl')
        model = cls.load(model_path, env=env)
        policy = getattr(model, policy_attr)

        try:
            vec_normalize = VecNormalize(env, training=False)
            vec_normalize.load_running_average(path)
            policy = NormalizePolicy(policy, vec_normalize)
            tf.logging.info(f"Loaded normalization statistics from '{path}'")
        except FileNotFoundError:
            # We did not use VecNormalize during training, skip
            pass

        return policy
Example #16
def create_env(args, env_id, godot_instances, params, session_path, eval=False):
    n = 1 if eval else args.n_agents_per_env
    env = SubprocVecEnv([make_godot_env(env_id, f'{obs_port}_{i}', obs_port, action_port,
                                        args, session_path, eval, seed=obs_port * i)
                         for i in range(n) for obs_port, action_port in godot_instances])

    vecnorm_path = get_vec_normalize_filepath(params, args)
    if vecnorm_path.exists():
        print(f'found vecnormalize data file @ {vecnorm_path.absolute()}. loading existing file.')
        env = VecNormalize.load(vecnorm_path, env)
    else:
        print(f'unable to find existing vecnormalize data file @ {vecnorm_path.absolute()}. creating a new one.')
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=1.0, clip_reward=100.0)

    if args.n_stack > 1:
        env = VecFrameStack(env, n_stack=args.n_stack)

    return env
Example #17
def run():
    """
    The main function of the agent
    Parses argv and executes accordingly
    """
    visualize = sys.argv[1] == "v" if len(sys.argv) > 1 else False
    resume = sys.argv[1] == "r" if len(sys.argv) > 1 else False
    evaluate = visualize or (sys.argv[1] == "e"
                             if len(sys.argv) > 1 else False)
    loadpath = sys.argv[2] if resume or evaluate else ""
    print("Setting up env")
    env = SubprocVecEnv([make_env(ENV, i) for i in range(N_PROCS)],
                        start_method='spawn')

    eval_env = DummyVecEnv([make_env(ENV, i) for i in range(N_PROCS)])
    eval_env = VecNormalize(eval_env,
                            norm_obs=True,
                            norm_reward=False,
                            clip_obs=10.)

    print("Setting up model")

    if not (resume or evaluate):
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
        model = Model(env=env,
                      eval_env=eval_env,
                      env_name=ENV_NAME,
                      seed=SEED,
                      n_procs=N_PROCS,
                      num_steps=NUM_STEPS)
    else:
        model = Model.load(loadpath,
                           env,
                           eval_env=eval_env,
                           env_name=ENV_NAME,
                           seed=SEED,
                           n_procs=N_PROCS,
                           num_steps=NUM_STEPS)
        #model = Model(env=None, eval_env=eval_env, env_name="FieldEnv", seed=SEED, n_procs=N_PROCS, num_steps=NUM_STEPS)

    if not evaluate:
        model.trainAndSave()
    else:
        model.evaluate(visualize)
Example #18
def main():
    all_ports = []
    parser = argparse.ArgumentParser()
    parser.add_argument("algorithm",
                        help='Which algorithm are you using',
                        type=str)
    parser.add_argument("training_timesteps",
                        help="How many traning steps are there?",
                        type=int)
    parser.add_argument("testing_timesteps",
                        help="How many testing steps are there?",
                        type=int)
    parser.add_argument("training_iterations",
                        help="How many traning iterations are there?",
                        type=int)
    parser.add_argument("testing_iterations",
                        help="How many traning iterations are there?",
                        type=int)
    parser.add_argument("learning_rate",
                        help="What is the learning rate?",
                        type=float)
    parser.add_argument("batch_size", help="What is the batch size?", type=int)
    parser.add_argument("building_port",
                        help="What is the building_port?",
                        type=int)
    parser.add_argument("reward_port",
                        help="What is the reward_port?",
                        type=int)
    parser.add_argument("agent_port", help="What is the agent_port?", type=int)
    args = parser.parse_args()
    all_ports = [args.building_port, args.reward_port, args.agent_port]

    df11 = pd.DataFrame(all_ports)
    df11.to_csv('allports.csv', index=False)

    hostname = socket.gethostname()
    # Path
    path = os.path.join(sys.path[0], hostname)
    # os.mkdir(path)
    path_for_kill_file = os.path.join(sys.path[0], "kill.sh")

    env = gym.make('RCRS-v2')
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    # Automatically normalize the input features
    env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
    run_model(args.algorithm,
              args.training_timesteps,
              args.testing_timesteps,
              args.training_iterations,
              args.testing_iterations,
              args.learning_rate,
              args.batch_size,
              env=env,
              hostname=hostname,
              path_for_kill_file=path_for_kill_file)
Example #19
def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env,
                       norm_obs=True,
                       norm_reward=True,
                       clip_obs=10.,
                       clip_reward=10.)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env,
                            training=False,
                            norm_obs=True,
                            norm_reward=True,
                            clip_obs=10.,
                            clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    dummy_rewards = np.random.rand(10)
    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(env.normalize_reward(dummy_rewards),
                       eval_env.normalize_reward(dummy_rewards))
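
The key call in this test is sync_envs_normalization(env, eval_env), which copies the running mean/std from the training wrapper into the evaluation wrapper, reaching through VecEnvWrapper layers such as VecFrameStack, so both envs normalize observations the same way. In training code this is typically done right before an evaluation pass; a minimal sketch under that assumption (the helper name is a placeholder):

from stable_baselines.common.vec_env import sync_envs_normalization

def evaluate_synced(model, train_env, eval_env, n_steps=1000):
    sync_envs_normalization(train_env, eval_env)  # copy obs_rms / ret_rms into eval_env
    obs = eval_env.reset()
    total = 0.0
    for _ in range(n_steps):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, _ = eval_env.step(action)
        total += reward[0]
    return total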
Example #20
def run(args):
    top_folder_idx = args.model.rfind('/')
    top_folder_str = args.model[0:top_folder_idx]
    config_file = top_folder_str + '/config.yaml'
    config = io_utils.load_yaml(config_file)
    normalize = config.get("normalize", False)

    if args.visualize:
        config['simulation']['real_time'] = False
        config['simulation']['visualize'] = True

    task = DummyVecEnv([
        lambda: gym.make(
            'gripper-env-v0', config=config, evaluate=True, test=args.test)
    ])

    if normalize:
        task = VecNormalize(task,
                            training=False,
                            norm_obs=False,
                            norm_reward=True,
                            clip_obs=10.)
        task = VecNormalize.load(
            os.path.join(top_folder_str, 'vecnormalize.pkl'), task)

    # task = gym.make('gripper-env-v0', config=config, evaluate=True, test=args.test)
    model_lower = args.model.lower()
    if 'trpo' == config["algorithm"]:
        agent = sb.TRPO.load(args.model)
    elif 'sac' == config["algorithm"]:
        agent = sb.SAC.load(args.model)
    elif 'ppo' == config["algorithm"]:
        agent = sb.PPO2.load(args.model)
    elif 'dqn' == config["algorithm"]:
        agent = sb.DQN.load(args.model)
    elif 'bdq' == config["algorithm"]:
        agent = sb.BDQ.load(args.model)
    else:
        raise Exception("Unknown algorithm: {}".format(config["algorithm"]))
    print("Run the agent")
    run_agent(task, agent, args.stochastic)
    task.close()
Example #21
    def _save_normalization_artifacts(self) -> None:
        # if normalize is active
        if isinstance(self.eval_env, VecNormalize) and not self.continue_learning:
            path = os.path.join(self.log_dir, "vecnormalize.pkl")
            if self.model.get_vec_normalize_env() is not None:
                self.model.get_vec_normalize_env().save(path)
                if self.verbose > 1:
                    print("Saving VecNormalize to {}".format(path))

            # don't know why but rewards are still normalized
            self.eval_env = VecNormalize.load(os.path.join(self.log_dir, "vecnormalize.pkl"), self.eval_env.unwrapped)
Example #22
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
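
_make_warmstart_cartpole() only runs random actions so that the wrapper accumulates non-trivial running statistics before the tests that use it. Those statistics live on the wrapper itself; for instance, one could inspect them like this (attribute names as in stable-baselines' VecNormalize):

venv = _make_warmstart_cartpole()
print(venv.obs_rms.mean, venv.obs_rms.var)  # per-dimension observation statistics
print(venv.ret_rms.mean, venv.ret_rms.var)  # discounted-return statistics used for reward scaling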
Example #23
def create_env(env_name, normalized, Training=False):
    env = gym.make(env_name)

    if normalized:
        from stable_baselines.common.vec_env import VecNormalize, DummyVecEnv
        vec_env = DummyVecEnv([lambda: env])
        vec_env = VecNormalize.load('data/models/env_stats/' + env_name + '.pkl',
                                    venv=vec_env)
        vec_env.training = Training
        vec_env.reward_range = env.reward_range
        return vec_env

    return env
Example #24
def load_train_env(num_envs, robot_radius, rew_fnc, num_stacks, stack_offset,
                   debug, task_mode, policy, disc_action_space, normalize):
    # Choosing environment wrapper according to the policy
    if policy == "CnnPolicy" or policy == "CnnLnLstmPolicy" or policy == "CnnLstmPolicy":
        if disc_action_space:
            env_temp = RosEnvDiscImg
        else:
            env_temp = RosEnvContImg
    elif policy == "CNN1DPolicy":
        if disc_action_space:
            env_temp = RosEnvDiscRawScanPrepWp
        else:
            env_temp = RosEnvContRawScanPrepWp
    elif policy == "CNN1DPolicy_multi_input":
        if disc_action_space:
            env_temp = RosEnvDiscRaw
        else:
            env_temp = RosEnvContRaw
    elif policy == "CnnPolicy_multi_input_vel" or policy == "CnnPolicy_multi_input_vel2":
        if disc_action_space:
            env_temp = RosEnvDiscImgVel
        else:
            env_temp = RosEnvContImgVel

    env = SubprocVecEnv([
        lambda k=k: Monitor(env_temp(
            "sim%d" % (k + 1), StateCollector("sim%s" %
                                              (k + 1), "train"), stack_offset,
            num_stacks, robot_radius, rew_fnc, debug, "train", task_mode),
                            '%s/%s/sim_%d' %
                            (path_to_models, agent_name, k + 1),
                            allow_early_resets=True) for k in range(num_envs)
    ])

    # Normalizing?
    if normalize:
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_obs=100.0,
                           clip_reward=10.0,
                           gamma=0.99,
                           epsilon=1e-08)

    # Stack of data?
    if num_stacks > 1:
        env = VecFrameStack(env, n_stack=num_stacks, n_offset=stack_offset)

    return env
Example #25
def run_ppo_policies(easy, main_dir, n_exps):
    env = VecNormalize(DummyVecEnv(
        [create_env_fn(0, monitored=False, easy=easy)]),
                       gamma=0.999,
                       training=False)

    states = []
    for i in range(1, n_exps + 1):
        states.append(
            np.array(
                run_ppo_policy(env, os.path.join(main_dir, "exp-" + str(i)))))

    return states
Example #26
 def train(self, num_e=1, n_timesteps=10000000, save_fraction=0.1, save='saves/m1'):
     env_id = "default"
     num_e = 32  # Number of processes to use
     # Create the vectorized environment
     #env = DummyVecEnv([lambda: env])
     #Ramona
     #self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
     env = Template_Gym()
     self.env = DummyVecEnv([lambda: env])
     self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
     #self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-5, tensorboard_log="./test6" )
     
     self.model = SAC(CustomPolicy_sac, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./m1lstm1")
     #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/" )
     n_timesteps = n_timesteps * save_fraction
     n_timesteps = int(n_timesteps)
     training_loop = 1 / save_fraction
     training_loop = int(training_loop)
     
     for i in range(training_loop):
         self.model.learn(n_timesteps)
         self.model.save(save+str(i))
Example #27
def make_env(env_id, env_args, seed, is_train, with_vecnorm):

    monitor_dir = os.path.join(env_args['log_file'], 'log')

    if is_train:
        # env for training
        env = make_vec_env(env_id=lambda: gym.make(env_id, **env_args),
                           seed=seed,
                           monitor_dir=monitor_dir,
                           n_envs=1)

        if with_vecnorm:
            env = VecNormalize(env,
                               norm_obs=True,
                               norm_reward=True,
                               clip_obs=10.,
                               clip_reward=10.)

        # env for evaluation during training
        env_args['renders'] = False
        if 'dset' in env_args:
            env_args['dset'] = 'eval'
        eval_env = make_vec_env(env_id=lambda: gym.make(env_id, **env_args),
                                seed=seed + 1,
                                monitor_dir=monitor_dir + '/eval',
                                n_envs=1)

        if with_vecnorm:
            eval_env = VecNormalize(eval_env,
                                    norm_obs=True,
                                    norm_reward=True,
                                    clip_obs=10.,
                                    clip_reward=10.)

    else:
        env = gym.make(env_id, **env_args)
        eval_env = None

    return env, eval_env
Example #28
        def create_env(n_envs):
            """
            Create the environment and wrap it if necessary
            :param n_envs: (int)
            :return: (gym.Env)
            """
            global hyperparams

            if is_atari:
                if args.verbose > 0:
                    print("Using Atari wrapper")
                env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
                # Frame-stacking with 4 frames
                env = VecFrameStack(env, n_stack=4)
            elif args.algo in ['dqn', 'ddpg']:
                if hyperparams.get('normalize', False):
                    print(
                        "WARNING: normalization not supported yet for DDPG/DQN"
                    )
                # No env_wrapper applied for now as not using make_env()
                env = gym.make(env_id)
                env.seed(args.seed)
            else:
                if n_envs == 1:
                    env = DummyVecEnv([
                        make_env(env_id,
                                 0,
                                 args.seed,
                                 wrapper_class=env_wrapper)
                    ])
                else:
                    # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                    # On most env, SubprocVecEnv does not help and is quite memory hungry
                    env = DummyVecEnv([
                        make_env(env_id,
                                 i,
                                 args.seed,
                                 wrapper_class=env_wrapper)
                        for i in range(n_envs)
                    ])
                if normalize:
                    if args.verbose > 0:
                        print("Normalizing input and return")
                    env = VecNormalize(env, **normalize_kwargs)
            # Optional Frame-stacking
            if hyperparams.get('frame_stack', False):
                n_stack = hyperparams['frame_stack']
                env = VecFrameStack(env, n_stack)
                print("Stacking {} frames".format(n_stack))
                del hyperparams['frame_stack']
            return env
Example #29
def vecEnv(env_kwargs_local, env_class):
    """
    Local Env Wrapper
    :param env_kwargs_local: arguments related to the environment wrapper
    :param env_class: class of the env
    :return: env for the pretrained algo
    """
    train_env = env_class(**{
        **env_kwargs_local, "record_data": False,
        "renders": False
    })
    train_env = DummyVecEnv([lambda: train_env])
    train_env = VecNormalize(train_env, norm_obs=True, norm_reward=False)
    return train_env
Example #30
def create_env(env_name, config=None, n_workers=8, image_based=True, **kwargs):
    """
    Parses the environment to correctly return the attributes based on the spec and type
    Creates a corresponding vectorized environment
    """
    def make_rl(**kwargs):
        """
        Decorator for custom RL environments
        """
        def _init():
            env_obj = getattr(rl.environments, env_name)
            env = env_obj(config)
            return env

        return _init

    def make_gym(rank, seed=0, **kwargs):
        """
        Decorator for gym environments
        """
        def _init():
            env = gym.make(env_name)
            env.seed(seed + rank)
            return env

        return _init

    if config is not None:
        n_workers = config['main']['n_workers']
    mapping = {'gym': make_gym, 'rl': make_rl}
    env_type = get_env_type(env_name)
    env_decorator = mapping[env_type]
    vectorized_decorator = [env_decorator(rank=x) for x in range(n_workers)]

    # Parallelize
    if n_workers > 1:
        method = 'spawn' if sys.platform == 'win32' else 'forkserver'
        vectorized = SubprocVecEnv(vectorized_decorator, start_method=method)
    else:  # Non multi-processing env
        vectorized = DummyVecEnv(vectorized_decorator)

    # Frame-stacking for CNN based environments
    if 'frame_stack' in config['main'].keys():
        if config['main']['frame_stack'] != 0:
            vectorized = VecFrameStack(vectorized,
                                       n_stack=config['main']['frame_stack'])
    if 'normalize' in config['main'].keys():
        vectorized = VecNormalize(vectorized, clip_obs=1, clip_reward=1)

    return vectorized