Example #1
    def train_model(self):

        auto_save_callback = SaveOnBestTrainingRewardCallback(
            log_dir=self.log_dir)
        auto_save_callback_every_1000_steps = EveryNTimesteps(
            n_steps=1000, callback=auto_save_callback)

        self.environment = Monitor(self.environment, self.log_dir)
        self.model = self.algorithm('MlpPolicy',
                                    self.environment,
                                    verbose=1,
                                    tensorboard_log=self.log_dir)

        name = self.model_name + "_full_model"
        checkpoint_callback = SavePerformanceOnCheckpoints(
            resource_manager=self,
            name=name,
            checkpoint_results=self.checkpoint_results)
        checkpoint_callback_every_1000_steps = EveryNTimesteps(
            n_steps=1000, callback=checkpoint_callback)

        with ProgressBarManager(self.training_steps) as progress_callback:
            self.model.learn(total_timesteps=self.training_steps,
                             callback=[
                                 progress_callback,
                                 auto_save_callback_every_1000_steps,
                                 checkpoint_callback_every_1000_steps
                             ])

        self.save_episode_rewards_as_csv()
        model_path = os.path.abspath("models/" + name)
        self.model.save(model_path)
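Every example on this page follows the same idiom: EveryNTimesteps wraps another callback and fires it on a fixed timestep interval. A minimal, self-contained sketch of the pattern using only stock stable-baselines3 classes (the environment and interval are illustrative):

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback, EveryNTimesteps

# Child callback: save a checkpoint each time it is triggered
checkpoint_on_event = CheckpointCallback(save_freq=1, save_path="./logs/")
# Trigger the child callback once every 1000 environment timesteps
event_callback = EveryNTimesteps(n_steps=1000, callback=checkpoint_on_event)

model = PPO("MlpPolicy", "CartPole-v1", verbose=0)
model.learn(total_timesteps=5000, callback=event_callback)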
Example #2
    def train_stage2_model(self,
                           environment_kwargs=None,
                           policy_kwargs=None,
                           hyperparams=None,
                           training_steps=20000,
                           model_name="",
                           stage1_time=0):

        config = {
            "verbose": 1,
            "tensorboard_log": self.log_dir,
            "policy_kwargs": policy_kwargs
        }

        if hyperparams is not None:
            config.update(hyperparams)

        if environment_kwargs is None:
            environment_kwargs = {}
        print(environment_kwargs)
        environment = ResourceAllocationEnvironment(self.ra_problem,
                                                    **environment_kwargs)
        environment = Monitor(environment, self.log_dir)
        self.environment = environment

        model = PPO(MultiStageActorCritic, environment, **config)
        with ProgressBarManager(training_steps) as progress_callback:
            auto_save_callback = SaveOnBestTrainingRewardCallback(
                log_dir=self.log_dir)
            auto_save_callback_every_1000_steps = EveryNTimesteps(
                n_steps=1000, callback=auto_save_callback)
            checkpoint_callback = SavePerformanceOnCheckpoints(
                stage1_time=stage1_time,
                resource_manager=self,
                name=model_name,
                checkpoint_results=self.checkpoint_results)
            checkpoint_callback_every_1000_steps = EveryNTimesteps(
                n_steps=1000, callback=checkpoint_callback)
            callbacks = [
                progress_callback, auto_save_callback_every_1000_steps,
                checkpoint_callback_every_1000_steps
            ]
            model.learn(total_timesteps=training_steps, callback=callbacks)

        self.save_episode_rewards_as_csv()

        full_model_path = os.path.abspath("models/" + self.model_name +
                                          "_full_model")
        model.save(full_model_path)

        return model
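MultiStageActorCritic in the example above is a project-specific policy class; stable-baselines3's PPO accepts either a registered policy name such as "MlpPolicy" or a custom ActorCriticPolicy subclass as its first argument. When only the network architecture needs to change, policy_kwargs is usually enough; a minimal sketch (architecture values are illustrative):

from stable_baselines3 import PPO

model = PPO("MlpPolicy",
            environment,
            policy_kwargs=dict(net_arch=[64, 64]),
            verbose=1)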
Example #3
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(
        eval_env, callback_on_new_best=callback_on_best, best_model_save_path=log_folder, log_path=log_folder, eval_freq=100
    )

    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback])

    model.learn(500, callback=callback)
    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)
    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
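Note the last three learn() calls above: stable-baselines3 accepts an explicit CallbackList, a plain Python list (wrapped into a CallbackList automatically), or a bare function taking (locals, globals) dicts, which it converts into a callback; returning False from that function stops training. Restated with the names from the example:

model.learn(500, callback=CallbackList([checkpoint_callback, eval_callback]))
model.learn(500, callback=[checkpoint_callback, eval_callback])  # wrapped automatically
model.learn(500, callback=lambda _locals, _globals: True)  # converted to a callback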
Example #4
    def train_adp_model(self, regional_policies=None, setup_start=0.):
        regions = self.regions
        environment = ADPResourceAllocationEnvironment(
            self.ra_problem,
            regions,
            regional_policies,
            abstract_action_to_direction=self.abstract_action_to_direction,
            n_locked_tasks=self.n_locked_tasks,
            n_abstract_actions=self.n_abstract_actions)
        environment = Monitor(environment, self.log_dir)
        self.environment = environment
        adp_model = self.algorithm('MlpPolicy',
                                   environment,
                                   verbose=1,
                                   tensorboard_log=self.log_dir)
        self.model = adp_model

        auto_save_callback = SaveOnBestTrainingRewardCallback(
            log_dir=self.log_dir)
        auto_save_callback_every_1000_steps = EveryNTimesteps(
            n_steps=1000, callback=auto_save_callback)

        name = self.model_name + "_full_model_multi"
        setup_time = time.time() - setup_start
        checkpoint_callback = SavePerformanceOnCheckpoints(
            stage1_time=setup_time,
            resource_manager=self,
            name=name,
            checkpoint_results=self.checkpoint_results)
        checkpoint_callback_every_1000_steps = EveryNTimesteps(
            n_steps=1000, callback=checkpoint_callback)

        training_steps = self.training_config["stage2_training_steps"]
        with ProgressBarManager(training_steps) as progress_callback:
            adp_model.learn(total_timesteps=training_steps,
                            callback=[
                                progress_callback,
                                auto_save_callback_every_1000_steps,
                                checkpoint_callback_every_1000_steps
                            ])

        self.save_episode_rewards_as_csv()

        full_model_path = os.path.abspath("models/" + self.model_name +
                                          "_full_model")
        adp_model.save(full_model_path)
Example #5
        def evaluate_objective(config):
            tune_env = deepcopy(base_env)
            tune_monitor = OptimizationCallback(tune_env, EVAL_EPISODES, True)
            monitor_callback = EveryNTimesteps(n_steps=args.report_interval,
                                               callback=tune_monitor)

            tune_agent = agent("MlpPolicy", tune_env, **config)
            tune_agent.learn(total_timesteps=args.sample_timesteps,
                             callback=monitor_callback)
Example #6
    def fit(self,
            env,
            episodes,
            verbose,
            episode_steps,
            callbacks,
            log_interval,
            agent_id=-1):
        """Mask the agent fit function
        To train the agent
        """
        logger.info("herer")
        # self.model.learn(total_timesteps=100, log_interval=10)
        #FIXME: use the tb logname meaningful!

        #TODO: Write callback funcs here:
        # List of callback:
        # Checkpoint Callback: save the model every 10 episodes.
        checkpoint_callback = CheckpointCallback(
            save_freq=96,
            save_path=self.agent_helper.config_dir,
            name_prefix='rl_model')
        # EvalCallback: evaluate every eval_freq steps and save the best model
        # to best_model_save_path.
        eval_env = env
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/',
                                     eval_freq=500,
                                     deterministic=True,
                                     render=False)
        # StopTrainingOnRewardThreshold: stop training once the reward
        # threshold is reached, i.e. the policy is considered good enough.
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=70,
                                                         verbose=1)
        eval_callback_reward_threshold = EvalCallback(
            eval_env, callback_on_new_best=callback_on_best, verbose=1)
        # EveryNTimesteps: trigger the wrapped callback every n_steps to save the model.
        checkpoint_on_event = CheckpointCallback(save_freq=1,
                                                 save_path='./logs/')
        event_callback_after_n_steps = EveryNTimesteps(
            n_steps=500, callback=checkpoint_on_event)

        # StopTrainingOnMaxEpisodes:
        # Stops training when the model reaches the maximum number of episodes
        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5,
                                                          verbose=1)

        # CallbackList: chain several callbacks together.
        callbacklist = CallbackList([checkpoint_callback, eval_callback])

        logger.info(f"Model: {self.model.get_env()}")
        with ProgressBarManager(log_interval) as progress_callback:
            self.model.learn(total_timesteps=log_interval,
                             callback=[progress_callback, checkpoint_callback])
        # mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        # self.eval_writer(mean_reward, std_reward)
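A detail of fit() that is easy to miss: StopTrainingOnRewardThreshold does nothing on its own; it only takes effect as the callback_on_new_best of an EvalCallback, which invokes it whenever a new best mean reward is found. A minimal sketch of the working pairing, reusing eval_env from above (the threshold is the example's own value):

stop_on_best = StopTrainingOnRewardThreshold(reward_threshold=70, verbose=1)
eval_with_stop = EvalCallback(eval_env,
                              callback_on_new_best=stop_on_best,
                              eval_freq=500,
                              verbose=1)
self.model.learn(total_timesteps=log_interval, callback=eval_with_stop)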
Example #7
def main(cfg: DictConfig):
    env = get_env(None, cfg.env)
    model = DQN(MlpPolicy,
                env,
                **cfg.model,
                tensorboard_log='logs/',
                verbose=1)

    callbacks = [TensorboardCallback()]
    if cfg.self_play:
        self_play = EveryNTimesteps(cfg.n_update_selfplay, callback=SelfPlay('ckpts/', cfg.env))
        callbacks.append(self_play)
    if cfg.ckpt_freq:
        ckpt_cb = CheckpointCallback(save_freq=cfg.ckpt_freq, save_path='ckpts/')
        callbacks.append(ckpt_cb)

    model.learn(total_timesteps=cfg.n_total_steps, callback=callbacks, tb_log_name=cfg.log_name)
Example #8
def runner(agent, episode, checkpoint, env):
    # scores = np.genfromtxt(checkpoint+'/data.csv', delimiter=',')
    # checkpoint2 = checkpoint+'2'
    custom_callback = LoggerCallback(episode, checkpoint=checkpoint)
    checkpoint_callback = CheckpointCallback(save_freq=100000, save_path=checkpoint,
                                             name_prefix='rl_model')
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=episode, verbose=1)
    event_callback = EveryNTimesteps(n_steps=1, callback=custom_callback)
    # load = os.path.abspath(checkpoint+'/rl_model_676000_steps')
    # print(load)
    # agent = DDPG.load(load, env)
    callback_list = CallbackList([event_callback, checkpoint_callback, callback_max_episodes])
    # agent.learn(total_timesteps=100000000, callback=callback_list, reward_function=reward)
    agent.learn(total_timesteps=100000000, callback=callback_list)
    scores = custom_callback.rewards
    np.savetxt(checkpoint+'/data.csv', scores, delimiter=',')

    return scores
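LoggerCallback above is project-specific and its source is not shown; with n_steps=1 the EveryNTimesteps wrapper invokes it on every environment step. Any child callback only needs to subclass BaseCallback and implement _on_step. A hypothetical sketch of such a per-step logger:

from stable_baselines3.common.callbacks import BaseCallback

class PerStepLogger(BaseCallback):
    """Hypothetical stand-in for the LoggerCallback used above."""

    def __init__(self, episode, checkpoint=None, verbose=0):
        # episode/checkpoint kept only to mirror the call above
        super().__init__(verbose)
        self.rewards = []

    def _on_step(self) -> bool:
        # self.locals exposes the training loop's local variables;
        # the exact keys depend on the algorithm and SB3 version
        self.rewards.append(self.locals.get("rewards"))
        return True  # returning False would stop training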
Example #9
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy",
                        env_name,
                        policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200,
                                                     verbose=1)

    eval_callback = EvalCallback(
        eval_env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100,
        warn=False,
    )
    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1,
                                             save_path=log_folder,
                                             name_prefix="event")

    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # Stop training if max number of episodes is reached
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100,
                                                      verbose=1)

    callback = CallbackList([
        checkpoint_callback, eval_callback, event_callback,
        callback_max_episodes
    ])
    model.learn(500, callback=callback)

    # Check access to local variables
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])
    # Check that the child callback was called
    assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert event_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"]
    # Check that internal callback counters match models' counters
    assert event_callback.num_timesteps == model.num_timesteps
    assert event_callback.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)

    # Testing models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a time limit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)

        model = model_class("MlpPolicy",
                            envs,
                            policy_kwargs=dict(net_arch=[32]))

        callback_max_episodes = StopTrainingOnMaxEpisodes(
            max_episodes=max_episodes, verbose=1)
        callback = CallbackList([callback_max_episodes])
        model.learn(1000, callback=callback)

        # Check that the actual number of episodes and timesteps per env matches the expected one
        episodes_per_env = callback_max_episodes.n_episodes // n_envs
        assert episodes_per_env == max_episodes
        timesteps_per_env = model.num_timesteps // n_envs
        assert timesteps_per_env == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
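The vectorized branch above also pins down how StopTrainingOnMaxEpisodes counts: n_episodes is summed over all parallel environments, so training stops after about max_episodes episodes per env, i.e. max_episodes * n_envs in total. Restated with the example's names:

assert callback_max_episodes.n_episodes == max_episodes * n_envs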
Example #10
def main():
    if StartFresh:
        # Create Environment
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        # Separate evaluation env
        eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # Create Model
        # model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log, device="auto")
        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[dict(pi=[256, 256], vf=[256, 256])])

        model = PPO('MlpPolicy',
            env,
            learning_rate=3e-5,
            n_steps=512,
            batch_size=128,
            n_epochs=20,
            gamma=0.99,
            gae_lambda=0.9,
            clip_range=0.4,
            vf_coef=0.5,
            use_sde=True,
            sde_sample_freq=4,
            policy_kwargs=policy_kwargs,
            verbose=1,
            tensorboard_log=tb_log,
            device="auto")


    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')
        # # Load Environment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()
        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()
        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if DoTraining:
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)
        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env, best_model_save_path=best_path,
                                    log_path=best_path, eval_freq=eval_freq,
                                    deterministic=True, render=False)
        # Video Update Callback 
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        # nStep_callback_list = CallbackList([envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)
        
        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])
        # callbacks = CallbackList([checkpoint_callback, eval_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
            tb_log_name=tb_log_name, 
            reset_num_timesteps=False,
            callback=callbacks)

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
Example #11
                            policy_kwargs=policy_kwargs,
                            verbose=1)

            else:
                # The noise objects for DDPG
                n_actions = env.action_space.shape[-1]
                # action_noise = NormalActionNoise(mean=np.zeros(
                # n_actions), sigma=args.action_noise * np.ones(n_actions))

                # model = DDPG('MlpPolicy', env, action_noise=action_noise, batch_size=args.batch_size,
                #  buffer_size=args.buffer_size, gamma=args.gamma, policy_kwargs=policy_kwargs, verbose=1)

        checkpoint_on_event = CheckpointCallback(save_freq=1,
                                                 name_prefix=get_params_str(
                                                     args.seed),
                                                 save_path='./checkpoints/')
        event_callback = EveryNTimesteps(n_steps=args.checkpoint_every,
                                         callback=checkpoint_on_event)

        model.learn(total_timesteps=args.total_timesteps,
                    log_interval=1,
                    callback=event_callback)

        if args.save_to:
            model.save(args.save_to)
        else:
            model.save("saved_models/" +
                       get_params_str(f"envSeed-{args.seed}"))

    if args.evaluate_for:
        evaluate(model, env)
Example #12
def main():
    if StartFresh:
        # Create Environment
        env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        # Separate evaluation env
        eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # Create Model
        model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log)

    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')
        # # Load Environment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()
        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()
        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if DoTraining:
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)
        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env, best_model_save_path=best_path,
                                    log_path=best_path, eval_freq=eval_freq,
                                    deterministic=True, render=False)
        # Video Update Callback 
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)
        
        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
            tb_log_name=tb_log_name, 
            reset_num_timesteps=False,
            callback=callbacks)

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)