def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(
        eval_env, callback_on_new_best=callback_on_best, best_model_save_path=log_folder, log_path=log_folder, eval_freq=100
    )

    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback])

    model.learn(500, callback=callback)
    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)
    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
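
Note: `select_env` is a test-suite helper that is not shown in this snippet; below is a minimal sketch consistent with how it is used here (the CartPole name is an assumption, while Pendulum is implied by the -1200 reward threshold and the comments in the longer example further down):

from stable_baselines3 import DQN

def select_env(model_class) -> str:
    # DQN only supports discrete action spaces, so give it a discrete-action env.
    if model_class is DQN:
        return "CartPole-v1"  # assumed discrete-action environment
    return "Pendulum-v0"      # continuous actions; matches the -1200 reward threshold
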
Example #2
    def fit(self,
            env,
            episodes,
            verbose,
            episode_steps,
            callbacks,
            log_interval,
            agent_id=-1):
        """Mask the agent fit function
        To train the agent
        """
        logger.info("herer")
        # self.model.learn(total_timesteps=100, log_interval=10)
        # FIXME: use a meaningful TensorBoard log name!

        # TODO: write callback functions here.
        # List of callbacks:
        # CheckpointCallback: save the model every `save_freq` steps.
        checkpoint_callback = CheckpointCallback(
            save_freq=96,
            save_path=self.agent_helper.config_dir,
            name_prefix='rl_model')
        # EvalCallback: evaluate every `eval_freq` steps and save the best model to `best_model_save_path`.
        eval_env = env
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/',
                                     eval_freq=500,
                                     deterministic=True,
                                     render=False)
        # StopTrainingOnRewardThreshold: stop training once the reward threshold is reached,
        # i.e. the policy is considered good enough.
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=70,
                                                         verbose=1)
        eval_callback_reward_threshold = EvalCallback(
            eval_env, callback_on_new_best=callback_on_best, verbose=1)
        # EveryNTimesteps: trigger the checkpoint callback every `n_steps` timesteps.
        checkpoint_on_event = CheckpointCallback(save_freq=1,
                                                 save_path='./logs/')
        event_callback_after_n_steps = EveryNTimesteps(
            n_steps=500, callback=checkpoint_on_event)

        # StopTrainingOnMaxEpisodes:
        # Stops training when the model reaches the maximum number of episodes
        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5,
                                                          verbose=1)

        # CallbackList: chain several callbacks together.
        callbacklist = CallbackList([checkpoint_callback, eval_callback])

        logger.info(f"Model: {self.model.get_env()}")
        with ProgressBarManager(log_interval) as progress_callback:
            self.model.learn(total_timesteps=log_interval,
                             callback=[progress_callback, checkpoint_callback])
        # mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        # self.eval_writer(mean_reward, std_reward)
        pass
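
`ProgressBarManager` used above is not defined in this snippet; below is a minimal sketch of one possible implementation, following the tqdm-based pattern from the Stable Baselines3 callback documentation (the internals are assumptions):

from tqdm.auto import tqdm
from stable_baselines3.common.callbacks import BaseCallback

class ProgressBarCallback(BaseCallback):
    """Update a tqdm progress bar with the current number of timesteps."""
    def __init__(self, pbar):
        super().__init__()
        self._pbar = pbar

    def _on_step(self) -> bool:
        self._pbar.n = self.num_timesteps
        self._pbar.update(0)
        return True

class ProgressBarManager:
    """Context manager that owns the progress bar and yields the callback."""
    def __init__(self, total_timesteps):
        self.pbar = None
        self.total_timesteps = total_timesteps

    def __enter__(self):
        self.pbar = tqdm(total=self.total_timesteps)
        return ProgressBarCallback(self.pbar)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pbar.n = self.total_timesteps
        self.pbar.update(0)
        self.pbar.close()
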
Example #3
def main():
    # get init time and use it for save path
    now = datetime.now()
    save_path = './trained/' + now.strftime("%B %d, %Y - %H.%M")
    os.mkdir(save_path)
    # using sound library for pure fun
    engine = pyttsx3.init()  # object creation
    engine.setProperty('rate', 150)  # setting up new voice rate

    with open('config.yml') as file:
        configurations = yaml.safe_load(file)
    configurations['general']['flightgear'] = 'false'
    configurations['general']['agent_interaction_freq'] = 5
    with open('config.yml', 'w') as file:
        yaml.dump(configurations, file)

    env_make = make_vec_env(configurations['general']['env'], n_envs=1, seed=0)
    env = VecNormalize(env_make, norm_obs=True, norm_reward=True, clip_obs=10.)

    # Stop training when the model reaches the reward threshold
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=300,
                                                     verbose=1)
    eval_callback = EvalCallback(
        env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=save_path,
        eval_freq=int(configurations['train']['timesteps'] / 100),
        deterministic=True)
    with open(save_path + '/env.pkl', "wb") as file_handler:
        pickle.dump(env, file_handler, pickle.HIGHEST_PROTOCOL)

    if configurations['train']['model'] == "none":
        print("--> Alican's LOG: A new model will be created for training")
        model = Agents.create_model(env,
                                    configurations['general']['algorithm'],
                                    save_path)
    else:
        print(
            "--> Alican's LOG: An existing model will be used for training"
        )
        model = Agents.load_model(
            env, configurations['general']['algorithm'],
            configurations['train']['model'] + '/best_model')

    model.learn(total_timesteps=configurations['train']['timesteps'],
                callback=eval_callback,
                log_interval=20)

    engine.say("Training is finished!")
    engine.runAndWait()
    engine.stop()
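
The `VecNormalize` wrapper is pickled above so its normalization statistics can be reused later; below is a minimal sketch of how it might be restored for evaluation (this reload code is not part of the original script and simply reuses its `configurations` and `save_path` names):

import pickle
from stable_baselines3.common.env_util import make_vec_env

# Rebuild the raw vectorized environment, then attach it to the unpickled wrapper.
raw_env = make_vec_env(configurations['general']['env'], n_envs=1, seed=0)
with open(save_path + '/env.pkl', "rb") as file_handler:
    eval_env = pickle.load(file_handler)
eval_env.set_venv(raw_env)      # VecNormalize is pickled without its inner env
eval_env.training = False       # do not update the running statistics
eval_env.norm_reward = False    # report unnormalized rewards during evaluation
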
Example #4
def train(
    model: BaseAlgorithm, timesteps: int, eval_env: GymEnv, model_path: Path
) -> None:
    """
    Train agent moves in his environment. Learning will finish when agent performs given number of timesteps or when mean reward of 10 gameplays reachs value 1.
    :param model: RL agent
    :param timesteps: total number of steps to take (through all episodes)
    :param eval_env: evaluation environment
    :param model_path: location where model will be saved
    :param tb_log_name: the name of the run for tensorboard log
    """
    mlflow_callback = MlflowCallback(model_path)
    reward_threshold_callback = StopTrainingOnRewardThreshold(
        reward_threshold=1
    )
    eval_callback = MlflowEvalCallback(
        eval_env=eval_env, callback_on_new_best=reward_threshold_callback
    )
    callbacks = CallbackList([mlflow_callback, eval_callback])

    model.learn(total_timesteps=timesteps, callback=callbacks)
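
`MlflowCallback` and `MlflowEvalCallback` are project-specific classes that are not shown here; below is a minimal sketch of what such MLflow-logging callbacks could look like, using only the standard mlflow API (class internals and metric names are assumptions):

import mlflow
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback

class MlflowCallback(BaseCallback):
    """Save the model when training ends and log the final timestep count to MLflow."""
    def __init__(self, model_path):
        super().__init__()
        self.model_path = model_path

    def _on_step(self) -> bool:
        return True

    def _on_training_end(self) -> None:
        self.model.save(self.model_path)
        mlflow.log_metric("total_timesteps", self.num_timesteps)

class MlflowEvalCallback(EvalCallback):
    """EvalCallback that also reports the evaluation reward to MLflow."""
    def _on_step(self) -> bool:
        continue_training = super()._on_step()
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            mlflow.log_metric("eval_mean_reward", self.last_mean_reward,
                              step=self.num_timesteps)
        return continue_training
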
Example #5
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy",
                        env_name,
                        policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200,
                                                     verbose=1)

    eval_callback = EvalCallback(
        eval_env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100,
        warn=False,
    )
    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1,
                                             save_path=log_folder,
                                             name_prefix="event")

    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # Stop training if max number of episodes is reached
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100,
                                                      verbose=1)

    callback = CallbackList([
        checkpoint_callback, eval_callback, event_callback,
        callback_max_episodes
    ])
    model.learn(500, callback=callback)

    # Check access to local variables
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])
    # Check that the child callback was called
    assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert event_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"]
    # Check that internal callback counters match models' counters
    assert event_callback.num_timesteps == model.num_timesteps
    assert event_callback.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)

    # Testing models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a timelimit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)

        model = model_class("MlpPolicy",
                            envs,
                            policy_kwargs=dict(net_arch=[32]))

        callback_max_episodes = StopTrainingOnMaxEpisodes(
            max_episodes=max_episodes, verbose=1)
        callback = CallbackList([callback_max_episodes])
        model.learn(1000, callback=callback)

        # Check that the actual number of episodes and timesteps per env matches the expected one
        episodes_per_env = callback_max_episodes.n_episodes // n_envs
        assert episodes_per_env == max_episodes
        timesteps_per_env = model.num_timesteps // n_envs
        assert timesteps_per_env == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
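
The `callback=lambda _locals, _globals: True` call in the test above relies on Stable Baselines3 automatically wrapping a plain callable; the same conversion can be done explicitly with `ConvertCallback`, as in this short sketch:

from stable_baselines3.common.callbacks import ConvertCallback

# Equivalent to passing the lambda directly: the callable is wrapped into a
# BaseCallback whose _on_step forwards the locals/globals dictionaries to it.
legacy_callback = ConvertCallback(lambda _locals, _globals: True)
model.learn(500, callback=legacy_callback)
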
Example #6
                                    seed=0)
        if env_name == "hover-aviary-v0":
            eval_env = make_vec_env(HoverAviary,
                                    env_kwargs=sa_env_kwargs,
                                    n_envs=1,
                                    seed=0)
        if env_name == "flythrugate-aviary-v0":
            eval_env = make_vec_env(FlyThruGateAviary,
                                    env_kwargs=sa_env_kwargs,
                                    n_envs=1,
                                    seed=0)
        eval_env = VecTransposeImage(eval_env)

    #### Train the model #######################################
    # checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=filename+'-logs/', name_prefix='rl_model')
    callback_on_best = StopTrainingOnRewardThreshold(
        reward_threshold=EPISODE_REWARD_THRESHOLD, verbose=1)
    eval_callback = EvalCallback(eval_env,
                                 callback_on_new_best=callback_on_best,
                                 verbose=1,
                                 best_model_save_path=filename + '/',
                                 log_path=filename + '/',
                                 eval_freq=int(5000 / ARGS.cpu),
                                 deterministic=True,
                                 render=False)
    model.learn(total_timesteps=int(1e12),
                callback=eval_callback,
                log_interval=100)

    ### Save the model #########################################
    model.save(filename + '/success_model.zip')  # Possibly never achieved
    print(filename)
Example #7
def main():

    set_random_seed(RANDOM_SEED)

    t_start = time()
    name = "LargeFinalLayer"

    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)
    os.makedirs(checkpoint_path, exist_ok=True)

    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)

    results_path = os.path.join(checkpoint_path, "results.json")

    env_args = dict(
        frame_skip=4,
        screen_size=84,
        terminal_on_life_loss=True,
        clip_reward=True,
    )

    # Creates a gym environment for an Atari game using the specified seed and number of environments.
    # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
    # for improved performance.
    # train_env = make_atari_env(ENV_NAME, n_envs=N_ENVS, seed=RANDOM_SEED, wrapper_kwargs=env_args)

    def atari_wrapper(env: gym.Env) -> gym.Env:
        env = AtariWrapper(env, **env_args)
        return env

    def make_env(rank: int, count: int) -> VecEnv:
        return make_vec_env(
            ENV_NAME,
            n_envs=count,
            seed=RANDOM_SEED + rank,
            start_index=0,
            monitor_dir=None,
            wrapper_class=atari_wrapper,
            env_kwargs=None,
            vec_env_cls=None,
            vec_env_kwargs=None,
            monitor_kwargs=None,
        )

    train_env = make_env(0, N_ENVS)
    eval_env = make_env(1, 1)

    # required by models in baselines
    train_env = VecTransposeImage(train_env)
    eval_env = VecTransposeImage(eval_env)

    # setup callback to save model at fixed intervals
    save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ,
                                       save_path=checkpoint_path,
                                       name_prefix=name)
    stop_callback = StopTrainingOnRewardThreshold(
        reward_threshold=EVAL_THRESHOLD)
    time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
    best_callback = EvalCallback(
        eval_env,
        eval_freq=EVAL_FREQ,
        best_model_save_path=checkpoint_path,
        callback_on_new_best=stop_callback,
    )
    list_callback = CallbackList([save_callback, best_callback, time_callback])

    model = PPO(
        CnnPolicy,
        train_env,
        verbose=VERBOSE,
        batch_size=BATCH_SIZE,
        seed=RANDOM_SEED,
        tensorboard_log=log_path,
        learning_rate=LEARNING_RATE,
        n_steps=UPDATE_STEPS,
        n_epochs=N_EPOCHS,
        ent_coef=ENT_COEF,
        vf_coef=VF_COEF,
        clip_range=CLIP_RANGE,
        device=DEVICE_TYPE,
        policy_kwargs=dict(features_extractor_class=FeatureExtractor),
    )

    config_path = os.path.join(checkpoint_path, "cnn_config")
    zip_path = os.path.join(checkpoint_path, "model.zip")

    # output the model config to a file for easier viewing
    with open(config_path, "w") as file:
        file.write(f"{name}\n")
        file.write(str(model.policy.features_extractor.cnn))

    print("Beginning training...")

    model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
    # model.learn(TRAIN_STEPS, tb_log_name="run")
    model.save(zip_path)

    del train_env
    # del eval_env

    time_taken = time() - t_start

    print("Beginning evaluation...")

    # mean score of the game and standard deviation over multiple evaluation runs
    reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

    with open(results_path, "w") as handle:
        handle.write(json.dumps((reward_mean, reward_std, time_taken)))
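
`TimeLimitCallback` is a custom callback that is not defined in this snippet; below is a minimal sketch of one way to implement it, assuming `max_time` is a wall-clock budget in seconds:

from time import time
from stable_baselines3.common.callbacks import BaseCallback

class TimeLimitCallback(BaseCallback):
    """Stop training once a wall-clock time budget (in seconds) is exceeded."""
    def __init__(self, max_time: float, verbose: int = 0):
        super().__init__(verbose)
        self.max_time = max_time
        self.start_time = None

    def _on_training_start(self) -> None:
        self.start_time = time()

    def _on_step(self) -> bool:
        # Returning False tells the learn() loop to stop early.
        return (time() - self.start_time) < self.max_time
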
Example #8
model_def = [64,64]

for task in reward_threshold.keys():
    TASK_NAME = task
    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
                                             name_prefix='rl_model')
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100 * 150 / 2, verbose=1)
    env = gym.make(TASK_NAME)

    log_dir = "./logs"

    env_m = monitor.Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env_m])
    env = VecNormalize(env, norm_obs=True, norm_reward=True)
    # Stop training when the model reaches the reward threshold
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=reward_threshold[TASK_NAME], verbose=1)
    eval_callback = EvalCallback(env, callback_on_new_best=callback_on_best, verbose=1)
    callback = CallbackList([callback_max_episodes, eval_callback])
   
    model = A2C('MlpPolicy', env, verbose=1, policy_kwargs=dict(net_arch=model_def))
    st = time.time()
    model.learn(total_timesteps=100 * 150 * 10000, callback=callback)
    elapse_time = time.time() - st

    with open("./outdir/"+TASK_NAME + ".plt", "wb") as fd:
        chkpt = {
            "elapse_time": elapse_time,
            "reward_threshold" : reward_threshold,
            "reward_list" : env_m.get_episode_rewards(),
            "timestep_list": env_m.get_episode_lengths(),
            "runtime_list" : env_m.get_episode_times(),
Example #9
def evaluate(individual: Individual,
             device: Union[torch.device, str] = "auto") -> Tuple[int]:
    """
    Evaluate a single individual model and return it's mean score after the training time is elapsed.
    Models are trained and evaluated for a number of timestamps as parameterized in the constants at the
    top of the file.
    :param individual: The individual to evaluate.
    :return:
    """

    t_start = time()
    layers = individual.weights
    name = individual.encode()

    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)

    if os.path.exists(checkpoint_path):
        return (random.randint(MIN_SCORE, MAX_SCORE), )

    os.makedirs(checkpoint_path, exist_ok=True)
    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)

    results_path = os.path.join(checkpoint_path, "results.json")

    if not os.path.exists(results_path):
        env_args = dict(
            frame_skip=4,
            screen_size=84,
            terminal_on_life_loss=True,
            clip_reward=True,
        )

        # Creates a gym environment for an Atari game using the specified seed and number of environments.
        # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
        # for improved performance.
        def atari_wrapper(env: gym.Env) -> gym.Env:
            env = AtariWrapper(env, **env_args)
            return env

        def make_env(rank: int, count: int) -> VecEnv:
            return make_vec_env(
                ENV_NAME,
                n_envs=count,
                seed=RANDOM_SEED + rank,
                start_index=0,
                monitor_dir=None,
                wrapper_class=atari_wrapper,
                env_kwargs=None,
                vec_env_cls=SubprocVecEnv,
                vec_env_kwargs=None,
                monitor_kwargs=None,
            )

        train_env = make_env(0, N_ENVS)
        eval_env = make_env(1, 1)

        # required by models in baselines
        train_env = VecTransposeImage(train_env)
        eval_env = VecTransposeImage(eval_env)

        # setup callback to save model at fixed intervals
        save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ,
                                           save_path=checkpoint_path,
                                           name_prefix=name)
        stop_callback = StopTrainingOnRewardThreshold(
            reward_threshold=EVAL_THRESHOLD)
        time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
        best_callback = EvalCallback(
            eval_env,
            eval_freq=EVAL_FREQ,
            best_model_save_path=checkpoint_path,
            callback_on_new_best=stop_callback,
        )
        list_callback = CallbackList(
            [save_callback, best_callback, time_callback])

        model = PPO(
            CnnPolicy,
            train_env,
            verbose=VERBOSE,
            batch_size=BATCH_SIZE,
            seed=RANDOM_SEED * 7,
            tensorboard_log=log_path,
            learning_rate=LEARNING_RATE,
            n_steps=UPDATE_STEPS,
            n_epochs=N_EPOCHS,
            ent_coef=ENT_COEF,
            vf_coef=VF_COEF,
            clip_range=CLIP_RANGE,
            device=device,
            policy_kwargs=dict(features_extractor_class=VariableBenchmark,
                               features_extractor_kwargs=dict(layers=layers)),
        )

        config_path = os.path.join(checkpoint_path, "cnn_config")
        zip_path = os.path.join(checkpoint_path, "model.zip")

        # output the model config to a file for easier viewing
        with open(config_path, "w") as file:
            file.write(f"{name}\n")
            file.write(str(model.policy.features_extractor.cnn))

        print("Beginning training...")

        model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
        model.save(zip_path)

        del train_env
        del eval_env

        time_taken = time() - t_start

        print("Beginning evaluation...")

        # mean score of the game and standard deviation over multiple evaluation runs
        reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

        with open(results_path, "w") as handle:
            handle.write(json.dumps((reward_mean, reward_std, time_taken)))
    else:
        with open(results_path, "r") as handle:
            reward_mean, reward_std, time_taken = json.load(handle)

    reward_mean = abs(MIN_SCORE) + reward_mean
    value = (reward_mean * weighted_time(time_taken), )

    print(f"Evaluated {name} with a score of {value}  in {(time_taken):.2f}s")

    return value