Example #1
def test_custom_vec_env(tmp_path):
    """
    Standalone test for a special case (passing a custom VecEnv class) to avoid doubling the number of tests.
    """
    monitor_dir = tmp_path / "test_make_vec_env/"
    env = make_vec_env(
        "CartPole-v1",
        n_envs=1,
        monitor_dir=monitor_dir,
        seed=0,
        vec_env_cls=SubprocVecEnv,
        vec_env_kwargs={"start_method": None},
    )

    assert env.num_envs == 1
    assert isinstance(env, SubprocVecEnv)
    assert os.path.isdir(monitor_dir)
    # Kill subprocess
    env.close()
    # Cleanup folder
    shutil.rmtree(monitor_dir)

    # This should fail because DummyVecEnv does not have any keyword argument
    with pytest.raises(TypeError):
        make_vec_env("CartPole-v1", n_envs=1, vec_env_kwargs={"dummy": False})
Example #2
    def _ppo_training(cls, env_name: str, env_kwargs: Dict[str, Any],
                      agent_kwargs: Dict[str, Any]) -> bool:
        """ Run the PPO algorithm on a given environment and check if the reward
        threshold has been exceeded.
        """
        # Create a multiprocess environment
        train_env = make_vec_env(env_id=env_name,
                                 env_kwargs=env_kwargs,
                                 n_envs=int(N_THREADS // 2),
                                 vec_env_cls=SubprocVecEnv,
                                 seed=SEED)
        test_env = make_vec_env(env_id=env_name,
                                env_kwargs=env_kwargs,
                                n_envs=1,
                                vec_env_cls=DummyVecEnv,
                                seed=SEED)

        # Create the learning agent according to the chosen algorithm
        config = cls._get_default_config_stable_baselines()
        config.update(agent_kwargs)
        train_agent = PPO('MlpPolicy', train_env, **config, verbose=False)
        train_agent.eval_env = test_env

        # Run the learning process
        return train(train_agent, max_timesteps=150000)
Example #3
def test_vec_env_monitor_kwargs():
    env = make_vec_env("MountainCarContinuous-v0",
                       n_envs=1,
                       seed=0,
                       monitor_kwargs={"allow_early_resets": False})
    assert env.get_attr("allow_early_resets")[0] is False

    env = make_atari_env("BreakoutNoFrameskip-v4",
                         n_envs=1,
                         seed=0,
                         monitor_kwargs={"allow_early_resets": False})
    assert env.get_attr("allow_early_resets")[0] is False

    env = make_vec_env("MountainCarContinuous-v0",
                       n_envs=1,
                       seed=0,
                       monitor_kwargs={"allow_early_resets": True})
    assert env.get_attr("allow_early_resets")[0] is True

    env = make_atari_env(
        "BreakoutNoFrameskip-v4",
        n_envs=1,
        seed=0,
        monitor_kwargs={"allow_early_resets": True},
    )
    assert env.get_attr("allow_early_resets")[0] is True
Example #4
def ppo_stable_baselines_training():
    wandb.run = config.tensorboard.run
    wandb.tensorboard.patch(save=False, tensorboardX=True)

    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    envs = make_vec_env(config.env_name, n_envs=config.num_processes)

    model = PPO("CnnPolicy",
                envs,
                verbose=1,
                tensorboard_log="./runs/",
                clip_range=config.clip_param,
                n_steps=50,
                learning_rate=config.lr,
                gamma=config.gamma,
                gae_lambda=config.gae_lambda,
                ent_coef=config.entropy_coef,
                max_grad_norm=config.max_grad_norm,
                vf_coef=config.value_loss_coef,
                batch_size=config.num_mini_batch)
    model.learn(total_timesteps=config.num_steps,
                log_interval=1,
                callback=WandbStableBaselines3Callback())
    model.save(f"{config.env_name}_stable_baselines_ppo")
Example #5
def test_vec_env_wrapper_kwargs():
    env = make_vec_env("MountainCarContinuous-v0",
                       n_envs=1,
                       seed=0,
                       wrapper_class=MaxAndSkipEnv,
                       wrapper_kwargs={"skip": 3})
    assert env.get_attr("_skip")[0] == 3
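
wrapper_kwargs are forwarded to wrapper_class; a minimal sketch using gym's TimeLimit as the wrapper instead (the value 50 is only for illustration):

import gym
from stable_baselines3.common.env_util import make_vec_env

# The wrapper is applied on top of the Monitor wrapper, so its attributes
# are visible through get_attr().
env = make_vec_env("MountainCarContinuous-v0", n_envs=1, seed=0,
                   wrapper_class=gym.wrappers.TimeLimit,
                   wrapper_kwargs={"max_episode_steps": 50})
assert env.get_attr("_max_episode_steps")[0] == 50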
Example #6
def init_adv(adv_env_id, disable_adv=False, env_kwargs=None):
    # `render`, `seed`, `verbose` and `ts` are defined elsewhere in the original
    # script; this PPO variant additionally accepts `bridge` and `is_protagonist`.
    bridge = Bridge()
    default_env_kwargs = {
        'renders' if 'CartPole' in adv_env_id else 'render': render
    }
    if env_kwargs is None:
        env_kwargs = {}
    env_kwargs.update(default_env_kwargs)
    env = make_vec_env(adv_env_id, env_kwargs=env_kwargs, seed=seed)
    env = VecNormalize(env)
    prot_agent = PPO('MlpPolicy',
                     env,
                     verbose=verbose,
                     seed=seed,
                     n_steps=ts,
                     bridge=bridge,
                     is_protagonist=True)
    if disable_adv:
        bridge.link_agents(prot_agent, None)
    else:
        adv_agent = PPO('MlpPolicy',
                        env,
                        verbose=verbose,
                        seed=seed,
                        n_steps=ts,
                        bridge=bridge,
                        is_protagonist=False)
        bridge.link_agents(prot_agent, adv_agent)
    return prot_agent, env
Example #7
def test_multiprocessing(model_class):
    use_discrete_actions = model_class not in [SAC, TD3, DDPG]

    def make_env():
        env = DummyDictEnv(use_discrete_actions=use_discrete_actions, channel_last=False)
        env = gym.wrappers.TimeLimit(env, 100)
        return env

    env = make_vec_env(make_env, n_envs=2, vec_env_cls=SubprocVecEnv)

    kwargs = {}
    n_steps = 256

    if model_class in {A2C, PPO}:
        kwargs = dict(
            n_steps=128,
            policy_kwargs=dict(
                net_arch=[32],
                features_extractor_kwargs=dict(cnn_output_dim=32),
            ),
        )

    model = model_class("MultiInputPolicy", env, gamma=0.5, seed=1, **kwargs)

    model.learn(total_timesteps=n_steps)
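
As in the test above, the first argument to make_vec_env can be an env factory callable rather than an env id; a minimal sketch with a standard Gym env (CartPole-v1 chosen only for illustration):

import gym
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env():
    # Any zero-argument callable returning a gym.Env can be vectorized.
    return gym.wrappers.TimeLimit(gym.make("CartPole-v1"), 100)

if __name__ == "__main__":
    # SubprocVecEnv spawns worker processes, hence the __main__ guard.
    env = make_vec_env(make_env, n_envs=2, vec_env_cls=SubprocVecEnv)
    env.close()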
Example #8
def create_zoo_env(env_id, stats_dir, hyperparams, should_render=False):
    env_wrapper = get_wrapper_class(hyperparams)

    vec_env_cls = DummyVecEnv
    if "Bullet" in env_id and should_render:
        vec_env_cls = SubprocVecEnv

    env = make_vec_env(env_id,
                       wrapper_class=env_wrapper,
                       vec_env_cls=vec_env_cls)

    if stats_dir is not None:
        if hyperparams["normalize"]:
            norm_fpath = pjoin(stats_dir, "vecnormalize.pkl")

            if os.path.exists(norm_fpath):
                env = VecNormalize.load(norm_fpath, env)
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {norm_fpath} not found")

    max_episode_steps = gym.make(env_id).spec.max_episode_steps
    Spec = namedtuple("Spec", ["max_episode_steps"])
    env.spec = Spec(max_episode_steps=max_episode_steps)

    return env
Example #9
def get_env(op_policies, conf):
    env = make_vec_env(Expando,
                       env_kwargs=dict(**conf,
                                       policies_other=op_policies),
                       n_envs=1)
    env.reset()
    return env
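
env_kwargs are forwarded to the environment constructor of every copy; a minimal sketch with a hypothetical ParamEnv class (old-style Gym API, as in the rest of these examples):

import gym
import numpy as np
from stable_baselines3.common.env_util import make_vec_env

class ParamEnv(gym.Env):
    """Hypothetical env whose constructor takes keyword arguments."""

    def __init__(self, scale=1.0):
        self.scale = scale
        self.observation_space = gym.spaces.Box(-1.0, 1.0, shape=(2,), dtype=np.float32)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self):
        return self.observation_space.sample()

    def step(self, action):
        return self.observation_space.sample(), 0.0, False, {}

# Each of the two copies is constructed as ParamEnv(scale=2.0).
env = make_vec_env(ParamEnv, n_envs=2, env_kwargs={"scale": 2.0})
assert env.get_attr("scale") == [2.0, 2.0]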
Example #10
def test_replay_buffer_normalization(replay_buffer_cls):
    env = {ReplayBuffer: DummyEnv, DictReplayBuffer: DummyDictEnv}[replay_buffer_cls]
    env = make_vec_env(env)
    env = VecNormalize(env)

    buffer = replay_buffer_cls(100, env.observation_space, env.action_space)

    # Interact and store transitions
    env.reset()
    obs = env.get_original_obs()
    for _ in range(100):
        action = env.action_space.sample()
        _, _, done, info = env.step(action)
        next_obs = env.get_original_obs()
        reward = env.get_original_reward()
        buffer.add(obs, next_obs, action, reward, done, info)
        obs = next_obs

    sample = buffer.sample(50, env)
    # Test observation normalization
    for observations in [sample.observations, sample.next_observations]:
        if isinstance(sample, DictReplayBufferSamples):
            for key in observations.keys():
                assert th.allclose(observations[key].mean(0), th.zeros(1), atol=1)
        elif isinstance(sample, ReplayBufferSamples):
            assert th.allclose(observations.mean(0), th.zeros(1), atol=1)
    # Test reward normalization
    assert np.allclose(sample.rewards.mean(0), np.zeros(1), atol=1)
Example #11
def create_environment(config):
    if config.atari_wrapper:
        env = make_atari_env(config.environment, n_envs=config.workers)
        env = VecFrameStack(env, n_stack=1)
    else:
        env = make_vec_env(config.environment, n_envs=config.workers)
    env = DummyEnvWrapper(env, config.add_stoch)
    return env
Example #12
def test_discrete_obs_space(model_class, env):
    env = make_vec_env(env, n_envs=2, seed=0)
    kwargs = {}
    if model_class == DQN:
        kwargs = dict(buffer_size=1000, learning_starts=100)
    else:
        kwargs = dict(n_steps=256)
    model_class("MlpPolicy", env, **kwargs).learn(256)
Example #13
    def make_vec_env(self, dataset, env_args):
        env_args["df"] = dataset
        env = make_vec_env('crypt-v001', env_kwargs=env_args)
        env = VecCheckNan(env, raise_exception=True)
        env = VecNormalize(
            env, norm_obs=True, norm_reward=False, clip_obs=10.0, gamma=0.95
        )
        return env
Example #14
def test_warn_dqn_multi_env():
    with pytest.warns(UserWarning, match="The number of environments used is greater"):
        DQN(
            "MlpPolicy",
            make_vec_env("CartPole-v1", n_envs=2),
            buffer_size=100,
            target_update_interval=1,
        )
Example #15
    def create_envs(self,
                    n_envs: int,
                    eval_env: bool = False,
                    no_log: bool = False) -> VecEnv:
        """
        Create the environment and wrap it if necessary.

        :param n_envs:
        :param eval_env: Whether it is an environment used for evaluation or not
        :param no_log: Do not log training when doing hyperparameter optim
            (issue with writing the same file)
        :return: the vectorized environment, with appropriate wrappers
        """
        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env or no_log else self.save_path

        # env = SubprocVecEnv([make_env(env_id, i, self.seed) for i in range(n_envs)])
        # On most env, SubprocVecEnv does not help and is quite memory hungry
        env = make_vec_env(
            env_id=self.env_id,
            n_envs=n_envs,
            seed=self.seed,
            env_kwargs=self.env_kwargs,
            monitor_dir=log_dir,
            wrapper_class=self.env_wrapper,
            vec_env_cls=self.vec_env_class,
            vec_env_kwargs=self.vec_env_kwargs,
        )

        # Special case for GoalEnvs: log success rate too
        if "Neck" in self.env_id or self.is_robotics_env(self.env_id):
            self._log_success_rate(env)

        # Wrap the env into a VecNormalize wrapper if needed
        # and load saved statistics when present
        env = self._maybe_normalize(env, eval_env)

        # Optional Frame-stacking
        if self.frame_stack is not None:
            n_stack = self.frame_stack
            env = VecFrameStack(env, n_stack)
            if self.verbose > 0:
                print(f"Stacking {n_stack} frames")

        # Wrap if needed to re-order channels
        # (switch from channel last to channel first convention)
        if is_image_space(env.observation_space):
            if self.verbose > 0:
                print("Wrapping into a VecTransposeImage")
            env = VecTransposeImage(env)

        # check if wrapper for dict support is needed
        if self.algo == "her":
            if self.verbose > 0:
                print("Wrapping into a ObsDictWrapper")
            env = ObsDictWrapper(env)

        return env
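
A minimal standalone sketch of the same wrapping order used above, outside the class (Pendulum-v1 and the constants are only for illustration; the image-transpose and HER branches are omitted):

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack, VecNormalize

# Vectorize first, then normalize, then optionally frame-stack,
# mirroring the order in create_envs above.
env = make_vec_env("Pendulum-v1", n_envs=4, seed=0)
env = VecNormalize(env, norm_obs=True, norm_reward=True)
env = VecFrameStack(env, n_stack=4)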
Example #16
def train_from_logs(algo,
                    env_id,
                    eval_env=None,
                    log_dir="logs",
                    total_timesteps=300000,
                    tensorboard_log=None,
                    seed=0,
                    verbose=0,
                    n_envs=4,
                    outdir="results",
                    use_sde=True,
                    i=0):

    if eval_env is None:
        eval_env = env_id
    # create env
    if (algo in ["a2c", "ppo"]):
        env = make_vec_env(env_id, n_envs=n_envs, seed=seed)
    else:
        env = make_vec_env(env_id, n_envs=1, seed=seed)
    # Create and train agent
    agent = AGENT[algo]
    hyper = best_hyperpars(log_dir, env_id, algo, i=i)
    print("")
    print(algo, env_id)
    print(hyper)

    # Unless turned off in hyperparameters.yml
    # env = VecNormalize(env, gamma = hyper["params_gamma"])

    model = agent(env,
                  hyper,
                  'MlpPolicy',
                  verbose=verbose,
                  tensorboard_log=tensorboard_log,
                  seed=seed,
                  use_sde=use_sde)
    model.learn(total_timesteps=total_timesteps)
    # evaluate agent
    custom_eval(model,
                eval_env,
                algo,
                seed=seed,
                outdir=outdir,
                value=hyper["value"])
Example #17
def multiprocessing_with_off_policy_algorithms_example():
    # Multiprocessing with off-policy algorithms.

    env = make_vec_env("Pendulum-v1", n_envs=4, seed=0)

    # We collect 4 transitions per call to 'env.step()' and perform 2 gradient steps per call to 'env.step()'.
    # If gradient_steps=-1, we would do 4 gradient steps per call to 'env.step()'.
    model = SAC("MlpPolicy", env, train_freq=1, gradient_steps=2, verbose=1)
    model.learn(total_timesteps=10_000)
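
The comment above also mentions gradient_steps=-1; a minimal sketch of that variant under the same setup:

from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env

env = make_vec_env("Pendulum-v1", n_envs=4, seed=0)
# With gradient_steps=-1, SAC performs as many gradient steps as transitions
# collected, i.e. 4 per call to 'env.step()' when using 4 environments.
model = SAC("MlpPolicy", env, train_freq=1, gradient_steps=-1, verbose=1)
model.learn(total_timesteps=10_000)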
Example #18
def main():
    # Create the callback: check every 1000 steps
    log_dir = 'log'
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()

        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False)  # callback=callback

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000, prefix='ppo_' + env_name + videoName)
Example #19
def process(layers, case_number, steps, envs, verbose):
    name = f"c3a_A2C_{str(layers).replace(' ', '')}_{case_number}"
    print(f"Case: {name}")
    env = make_vec_env('PerigeeRaising-Continuous3D-v0',
                       n_envs=envs,
                       wrapper_class=lambda x: wrap(x))
    agent = create_agent(env, name, case_number, layers, verbose)
    print(f"  --> Training...")
    train_agent(agent, name, steps=steps, callbacks=[])
    print(f"  --> Testing...")
    test_agent(agent)
Example #20
    def make_env(rank: int, count: int) -> VecEnv:
        return make_vec_env(
            ENV_NAME,
            n_envs=count,
            seed=RANDOM_SEED + rank,
            start_index=0,
            monitor_dir=None,
            wrapper_class=atari_wrapper,
            env_kwargs=None,
            vec_env_cls=None,
            vec_env_kwargs=None,
            monitor_kwargs=None,
        )
Example #21
def test_offpolicy_multi_env(model_class):
    kwargs = {}
    if model_class in [SAC, TD3, DDPG]:
        env_id = "Pendulum-v0"
        policy_kwargs = dict(net_arch=[64], n_critics=1)
        # Check auto-conversion to VectorizedActionNoise
        kwargs = dict(action_noise=NormalActionNoise(np.zeros(1), 0.1 * np.ones(1)))
        if model_class == SAC:
            kwargs["use_sde"] = True
            kwargs["sde_sample_freq"] = 4
    else:
        env_id = "CartPole-v1"
        policy_kwargs = dict(net_arch=[64])

    def make_env():
        env = gym.make(env_id)
        # to check that the code handling timeouts runs
        env = gym.wrappers.TimeLimit(env, 50)
        return env

    env = make_vec_env(make_env, n_envs=2)
    model = model_class(
        "MlpPolicy",
        env,
        policy_kwargs=policy_kwargs,
        learning_starts=100,
        buffer_size=10000,
        verbose=0,
        train_freq=5,
        **kwargs,
    )
    model.learn(total_timesteps=150)

    # Check that gradient_steps=-1 works as expected:
    # perform as many gradient_steps as transitions collected
    train_freq = 3
    model = model_class(
        "MlpPolicy",
        env,
        policy_kwargs=policy_kwargs,
        learning_starts=0,
        buffer_size=10000,
        verbose=0,
        train_freq=train_freq,
        gradient_steps=-1,
        **kwargs,
    )
    model.learn(total_timesteps=train_freq)
    assert model.logger.name_to_value["train/n_updates"] == train_freq * env.num_envs
Example #22
def main():
    # get init time and use it for save path
    now = datetime.now()
    save_path = './trained/' + now.strftime("%B %d, %Y - %H.%M")
    os.mkdir(save_path)
    # using sound library for pure fun
    engine = pyttsx3.init()  # object creation
    engine.setProperty('rate', 150)  # setting up new voice rate

    with open('config.yml') as file:
        configurations = yaml.safe_load(file)
    configurations['general']['flightgear'] = 'false'
    configurations['general']['agent_interaction_freq'] = 5
    with open('config.yml', 'w') as file:
        yaml.dump(configurations, file)

    env_make = make_vec_env(configurations['general']['env'], n_envs=1, seed=0)
    env = VecNormalize(env_make, norm_obs=True, norm_reward=True, clip_obs=10.)

    # Stop training when the model reaches the reward threshold
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=300,
                                                     verbose=1)
    eval_callback = EvalCallback(
        env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=save_path,
        eval_freq=configurations['train']['timesteps'] / 100,
        deterministic=True)
    with open(save_path + '/env.pkl', "wb") as file_handler:
        pickle.dump(env, file_handler, pickle.HIGHEST_PROTOCOL)

    if configurations['train']['model'] == "none":
        print("--> Alican's LOG: A new model will be created for training")
        model = Agents.create_model(env,
                                    configurations['general']['algorithm'],
                                    save_path)
    else:
        print("--> Alican's LOG: An existing model will be used for training")
        model = Agents.load_model(
            env, configurations['general']['algorithm'],
            configurations['train']['model'] + '/best_model')

    model.learn(total_timesteps=configurations['train']['timesteps'],
                callback=eval_callback,
                log_interval=20)

    engine.say("Training is finished!")
    engine.runAndWait()
    engine.stop()
Example #23
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]

    env_params = {
        'time_step': TIME_STEP,
        'robot_class': QuadrupedRobot,
        'on_rack': False,
        'enable_self_collision': True,
        'motor_control_mode': MotorControlMode.HYBRID_COMPUTED_POS_TROT,
        'train_or_test': test_or_train
    }

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    policy_kwargs = {"net_arch": [{"pi": [512, 256], "vf": [512, 256]}]}

    if TEST_OR_TRAIN == "train":
        env = make_vec_env(env_change_input,
                           n_envs=NUM_CPUS,
                           seed=0,
                           env_kwargs=env_params,
                           vec_env_cls=SubprocVecEnv)
        env = VecNormalize(env)
        if not (os.path.exists(policy_save_dir)):
            os.makedirs(policy_save_dir)
        model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        # env = env_change_input(time_step=env_params['time_step'],
        #                        robot_class=env_params['robot_class'],
        #                        on_rack=env_params['on_rack'],
        #                        enable_self_collision=env_params['enable_self_collision'],
        #                        motor_control_mode=env_params['motor_control_mode'],
        #                        train_or_test=env_params['train_or_test'])
        env = env_change_input(**env_params)
        model_load_path = os.path.join(policy_save_dir,
                                       'ppo_3_17-03-2021_15-39-42')
        model = PPO.load(model_load_path)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Example #24
def train_and_test_ec(config, video_length_=1000, total_timesteps_=10000):
    print(config)
    if config.atari_wrapper:
        train_env = make_atari_env(config.environment, n_envs=config.workers)
        train_env = VecFrameStack(train_env, n_stack=1)
        shape = (84, 84, 1)
    else:
        train_env = make_vec_env(config.environment, n_envs=config.workers)
        shape = train_env.observation_space.shape

    rnet = RNetwork(shape, config.ensemble_size)
    vec_episodic_memory = [
        EpisodicMemory([64],
                       rnet.embedding_similarity,
                       replacement='random',
                       capacity=200) for _ in range(config.workers)
    ]
    target_image_shape = list(shape)
    # assert type(config.add_stoch) == bool, "Please indicate whether or not you want stoch added"
    train_env = CuriosityEnvWrapper(train_env, vec_episodic_memory,
                                    rnet.embed_observation, target_image_shape,
                                    config.add_stoch)
    r_network_trainer = RNetworkTrainer(rnet,
                                        learning_rate=config.rnet_lr,
                                        observation_history_size=2000,
                                        training_interval=1000)
    train_env.add_observer(r_network_trainer)
    tb_dir = os.path.join(config.log_dir, config.tb_subdir)
    model = config.agent(config.policy_model,
                         train_env,
                         config,
                         verbose=config.verbose,
                         tensorboard_log=tb_dir)

    model.learn(total_timesteps=total_timesteps_)

    print("stopped to learn")
    #model.save("models/"+config.experiment)

    obs = train_env.reset()

    for i in range(video_length_ + 1):

        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = train_env.step(action)
        train_env.render()
        if done.any():
            obs = train_env.reset()

    train_env.close()
Example #25
    def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False) -> VecEnv:
        """
        Create the environment and wrap it if necessary.
        :param n_envs:
        :param eval_env: Whether it is an environment used for evaluation or not
        :param no_log: Do not log training when doing hyperparameter optim
            (issue with writing the same file)
        :return: the vectorized environment, with appropriate wrappers
        """
        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env or no_log else self.save_path

        monitor_kwargs = {}
        # Special case for GoalEnvs: log success rate too
        if "Neck" in self.env_id or self.is_robotics_env(self.env_id) or "parking-v0" in self.env_id:
            monitor_kwargs = dict(info_keywords=("is_success",))

        # On most env, SubprocVecEnv does not help and is quite memory hungry
        # therefore we use DummyVecEnv by default
        env = make_vec_env(
            env_id=self.env_id,
            n_envs=n_envs,
            seed=self.seed,
            env_kwargs=self.env_kwargs,
            monitor_dir=None,                       # Avoid useless monitor file spam from plotting
            wrapper_class=self.env_wrapper,
            vec_env_cls=self.vec_env_class,
            vec_env_kwargs=self.vec_env_kwargs,
            monitor_kwargs=monitor_kwargs,
        )

        # Wrap the env into a VecNormalize wrapper if needed
        # and load saved statistics when present
        env = self._maybe_normalize(env, eval_env)

        # Optional Frame-stacking
        if self.frame_stack is not None:
            n_stack = self.frame_stack
            env = VecFrameStack(env, n_stack)
            if self.verbose > 0:
                print(f"Stacking {n_stack} frames")

        # Wrap if needed to re-order channels
        # (switch from channel last to channel first convention)
        if is_image_space(env.observation_space) and not is_image_space_channels_first(env.observation_space):
            if self.verbose > 0:
                print("Wrapping into a VecTransposeImage")
            env = VecTransposeImage(env)

        return env
Example #26
def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class):
    env = make_vec_env(env_id, n_envs, vec_env_cls=vec_env_cls, wrapper_class=wrapper_class, monitor_dir=None, seed=0)

    assert env.num_envs == n_envs

    if vec_env_cls is None:
        assert isinstance(env, DummyVecEnv)
        if wrapper_class is not None:
            assert isinstance(env.envs[0], wrapper_class)
        else:
            assert isinstance(env.envs[0], Monitor)
    else:
        assert isinstance(env, SubprocVecEnv)
    # Kill subprocesses
    env.close()
Example #27
def run_exp(exp_params):
    activation_fn = activation_fns[exp_params['act']]
    layers = [int(nb) for nb in exp_params['layers'].split(',')]
    nb_threads = int(exp_params['nb_threads'])
    freq_save = int(exp_params['save_every'])
    env_id = 'Trading-{}'.format(envs[exp_params['env']])
    train_steps = int(exp_params['train_steps']) * 10e4

    tmp_env = gym.make(env_id)
    tmp_env.reset()
    env_data = tmp_env.get_env_specs()

    trained_agents = glob.glob('trained_models/{}/*'.format(
        env_data['folder_name'] if 'folder_name' in
        env_data.keys() else env_data['env_name']))
    run_idx = len([agent for agent in trained_agents if 'agent' in agent])

    run_name = 'agent_{:03d}'.format(run_idx)

    model = PPO('MlpPolicy',
                make_vec_env(env_id, nb_threads),
                verbose=1,
                device=torch.device('cpu'),
                tensorboard_log='./runs/{}/'.format(
                    env_data['folder_name'] if 'folder_name' in
                    env_data.keys() else env_data['env_name']))

    model.learn(total_timesteps=train_steps, tb_log_name=run_name)

    env_data = model.env.envs[0].get_env_specs()
    env_data['run_name'] = run_name

    env_folder = 'trained_models/{}'.format(
        env_data['folder_name'] if 'folder_name' in
        env_data.keys() else env_data['env_name'])
    if not os.path.exists('trained_models'):
        os.mkdir('trained_models')
    if not os.path.exists(env_folder):
        os.mkdir(env_folder)
    if not os.path.exists('{}/{}'.format(env_folder, run_name)):
        os.mkdir('{}/{}'.format(env_folder, run_name))
        model.save('{}/{}/{}'.format(env_folder, run_name, run_name))

    recap = pd.Series(env_data.values(), index=env_data.keys())
    recap.to_csv('{}/{}/recap.csv'.format(env_folder, run_name), index=True)
Example #28
def test_evaluate_vector_env(n_envs):
    # Tests that the number of episodes evaluated is correct
    n_eval_episodes = 6

    env = make_vec_env("CartPole-v1", n_envs)
    model = A2C("MlpPolicy", "CartPole-v1", seed=0)

    class CountCallback:
        def __init__(self):
            self.count = 0

        def __call__(self, locals_, globals_):
            if locals_["done"]:
                self.count += 1

    count_callback = CountCallback()

    evaluate_policy(model, env, n_eval_episodes, callback=count_callback)

    assert count_callback.count == n_eval_episodes
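
Without a callback, evaluate_policy simply returns the mean and standard deviation of the episode rewards; a one-line sketch reusing the model and env from the test above (evaluate_policy imported from stable_baselines3.common.evaluation):

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)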
Example #29
def train(args):
    cuda_availability = torch.cuda.is_available()
    print('\n*************************')
    print('`CUDA` available: {}'.format(cuda_availability))
    print('Device specified: {}'.format(args.device))
    print('*************************\n')

    # load the config of the trained model:
    with open(args.pretrained_output / "train_arguments.yaml") as yaml_data:
        pretrain_arguments = yaml.load(yaml_data, Loader=yaml.FullLoader)

    pretrained_model = algorithms[pretrain_arguments["alg"]].load(
        args.pretrained_output /
        "".join(pretrain_arguments["model_name"].split(".")[:-1]),
        device='cpu')

    # Prepare tensorboard logging
    log_name = '{}_{}'.format(pretrain_arguments["experiment_name"],
                              datetime.now().strftime('%d-%m_%H-%M-%S'))
    run_dir = args.tensorboard_log + "/" + log_name
    Path(run_dir).mkdir(parents=True, exist_ok=True)
    callbacks = []
    # callbacks.append(CheckpointCallback(
    #    save_freq=1000000, save_path=run_dir, name_prefix='rl_model'))
    callbacks.append(LoggingCallback(logpath=run_dir))

    train_args = copy.copy(pretrain_arguments)
    pyaml.dump(train_args,
               open(os.path.join(run_dir, 'train_arguments.yaml'), 'w'))

    # Create the vectorized environment
    n_envs = pretrain_arguments["n_envs"]  # Number of processes to use
    env = make_vec_env(pretrain_arguments["task_name"], n_envs=n_envs)

    # Use set_env() so spaces are checked and the number of envs is updated
    pretrained_model.set_env(env)
    pretrained_model.learn(total_timesteps=args.total_timesteps,
                           callback=callbacks,
                           tb_log_name=log_name)

    pretrained_model.save(
        os.path.join(args.tensorboard_log + "/" + log_name, args.model_name))
Example #30
def objective(trial):
    # Getting the hyperparameters to test
    params, policy_kwargs = algo_utils[args.algorithm][0](trial)
    # Flag to keep track of whether using vectorized environment or not
    # Instantiating the environments
    env = make_vec_env(args.env, n_envs=params["n_envs"])
    params.pop("n_envs")
    # Instantiating the model and performing training
    model = algo_utils[args.algorithm][1]("MlpPolicy",
                                          env,
                                          verbose=0,
                                          policy_kwargs=policy_kwargs,
                                          **params)
    model.learn(total_timesteps=int(args.n_timesteps))
    # Evaluating the agent and reporting the mean cumulative reward
    eval_env = gym.make(args.env)
    eval_df = simulate_mdp_vec(env, eval_env, model, args.n_eval_episodes)
    mean_rew = eval_df.groupby(["rep"]).sum().mean(axis=0)["reward"]
    del model

    return mean_rew