Example #1
    def test_predict(self):
        # sample_num should be greater than past_seq_len, whose default value is 50
        sample_num = 100
        train_df = pd.DataFrame({
            "datetime":
            pd.date_range('1/1/2019', periods=sample_num),
            "value":
            np.random.randn(sample_num)
        })
        test_sample_num = 64
        test_df = pd.DataFrame({
            "datetime":
            pd.date_range('1/10/2019', periods=test_sample_num),
            "value":
            np.random.randn(test_sample_num)
        })

        tsp = TimeSequencePredictor(
            dt_col="datetime",
            target_col="value",
            extra_features_col=None,
        )
        pipeline = tsp.fit(train_df)
        y_pred = pipeline.predict(test_df)

        default_past_seq_len = 50
        assert y_pred.shape == (test_sample_num - default_past_seq_len + 1, 2)
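With the default past_seq_len of 50, the 64-row test frame yields 64 - 50 + 1 = 15 rolling look-back windows and hence 15 prediction rows; the second dimension of 2 covers the datetime column plus the one-step forecast of value.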
Example #2
    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 logs_dir="~/zoo_automl_logs",
                 extra_features_col=None,
                 search_alg=None,
                 search_alg_params=None,
                 scheduler=None,
                 scheduler_params=None,
                 name="automl"):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast
        :param extra_features_col: extra feature columns
        :param logs_dir: directory in which to save training logs
        :param search_alg: search algorithm used for the hyperparameter search
        :param search_alg_params: extra parameters for the search algorithm
        :param scheduler: trial scheduler used during the search
        :param scheduler_params: extra parameters for the scheduler
        :param name: name of the training job
        """
        target_col_list = target_col
        if isinstance(target_col, str):
            target_col_list = [target_col]
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col_list,
            logs_dir=logs_dir,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
            search_alg=search_alg,
            search_alg_params=search_alg_params,
            scheduler=scheduler,
            scheduler_params=scheduler_params,
            name=name)
Example #3
    def test_fit_BayesRecipe(self):
        from zoo.automl.config.recipe import BayesRecipe
        train_df, _, future_seq_len = self.create_dataset()
        tsp = TimeSequencePredictor(
            dt_col="datetime",
            target_col="value",
            future_seq_len=future_seq_len,
            extra_features_col=None,
        )
        pipeline = tsp.fit(train_df,
                           recipe=BayesRecipe(num_samples=1,
                                              training_iteration=2,
                                              epochs=1,
                                              look_back=(3, 5)))
        assert isinstance(pipeline, TimeSequencePipeline)
        assert isinstance(pipeline.feature_transformers,
                          TimeSequenceFeatureTransformer)
        assert isinstance(pipeline.model, BaseModel)
        assert pipeline.config is not None
        assert "epochs" in pipeline.config
        assert [
            config_name for config_name in pipeline.config
            if config_name.startswith('bayes_feature')
        ] == []
        assert [
            config_name for config_name in pipeline.config
            if config_name.endswith('float')
        ] == []
        assert 'past_seq_len' in pipeline.config
        assert 3 <= pipeline.config["past_seq_len"] <= 5
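Here look_back=(3, 5) puts the look-back length itself into the Bayesian search space, which is why the final asserts only check that the chosen past_seq_len falls between 3 and 5 rather than equaling a fixed value.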
Example #4
    def test_evaluate(self):
        # sample_num should be greater than past_seq_len, whose default value is 50
        sample_num = 100
        train_df = pd.DataFrame({
            "datetime":
            pd.date_range('1/1/2019', periods=sample_num),
            "value":
            np.random.randn(sample_num)
        })
        sample_num = 64
        test_df = pd.DataFrame({
            "datetime":
            pd.date_range('1/10/2019', periods=sample_num),
            "value":
            np.random.randn(sample_num)
        })

        tsp = TimeSequencePredictor(
            dt_col="datetime",
            target_col="value",
            extra_features_col=None,
        )
        pipeline = tsp.fit(train_df)
        print(
            "evaluate:",
            pipeline.evaluate(test_df,
                              metric=["mean_squared_error", "r_square"]))
Example #5
def main(train_path, pred_path, n_pred, dt, target, time_limit_min):
    # cap the total tuning wall-clock time (minutes converted to seconds)
    os.environ["TRIALRUNNER_WALLTIME_LIMIT"] = str(time_limit_min * 60)

    df_train = pd.read_csv(train_path)
    df_train[dt] = pd.to_datetime(df_train[dt])

    sc = init_spark_on_local(cores=mp.cpu_count(), spark_log_level="ERROR")
    ray_ctx = RayContext(sc=sc)
    ray_ctx.init()

    # every column other than the datetime and the target is treated as an extra feature
    extra_features_col = list(set(df_train.columns) - set([dt, target]))
    if not extra_features_col:
        extra_features_col = None
    tsp = TimeSequencePredictor(dt_col=dt,
                                target_col=target,
                                extra_features_col=extra_features_col,
                                future_seq_len=n_pred)
    pipeline = tsp.fit(df_train,
                       resources_per_trial={"cpu": 4},
                       recipe=BayesRecipe(num_samples=100000))

    # predict from the tail of the training data; only the first returned row is used:
    # its datetime marks the start of the horizon, the remaining columns hold the n_pred forecasts
    df_pred = pipeline.predict(df_train[-2:])
    x_pred = pd.date_range(df_pred.iloc[0][dt],
                           periods=n_pred,
                           freq=pd.infer_freq(df_train[dt]))
    y_pred = df_pred.iloc[0][1:]
    # assemble the forecast as a (datetime, target) frame before writing it out
    df_pred = pd.DataFrame({dt: x_pred, target: y_pred})
    df_pred.to_csv(pred_path, index=False)
Example #6
    def test_fit_SmokeRecipe(self):
        train_df, validation_df, future_seq_len = self.create_dataset()
        tsp = TimeSequencePredictor(
            dt_col="datetime",
            target_col="value",
            future_seq_len=future_seq_len,
            extra_features_col=None,
        )
        pipeline = tsp.fit(train_df, validation_df)
        assert isinstance(pipeline, TimeSequencePipeline)
        assert isinstance(pipeline.feature_transformers,
                          TimeSequenceFeatureTransformer)
        assert isinstance(pipeline.model, BaseModel)
        assert pipeline.config is not None
Example #7
    def get_input_tsp(self, future_seq_len, target_col):
        sample_num = np.random.randint(100, 200)
        test_sample_num = np.random.randint(20, 30)
        if isinstance(target_col, str):
            train_df = pd.DataFrame({
                "datetime":
                pd.date_range('1/1/2019', periods=sample_num),
                target_col:
                np.random.randn(sample_num)
            })
            test_df = pd.DataFrame({
                "datetime":
                pd.date_range('1/1/2019', periods=test_sample_num),
                target_col:
                np.random.randn(test_sample_num)
            })
        else:
            train_df = pd.DataFrame(
                {t: np.random.randn(sample_num)
                 for t in target_col})
            train_df["datetime"] = pd.date_range('1/1/2019',
                                                 periods=sample_num)
            test_df = pd.DataFrame(
                {t: np.random.randn(test_sample_num)
                 for t in target_col})
            test_df["datetime"] = pd.date_range('1/1/2019',
                                                periods=test_sample_num)
        tsp = TimeSequencePredictor(
            dt_col="datetime",
            target_col=target_col,
            future_seq_len=future_seq_len,
            extra_features_col=None,
        )
        return train_df, test_df, tsp, test_sample_num
Example #8
class AutoTSTrainer:
    """
    The Automated Time Series Forecast Trainer
    """
    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 extra_features_col=None):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast
        :param extra_features_col: extra feature columns
        """
        target_col_list = target_col
        if isinstance(target_col, str):
            target_col_list = [target_col]
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col_list,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
        )

    def fit(self,
            train_df,
            validation_df=None,
            metric="mse",
            recipe: Recipe = SmokeRecipe(),
            uncertainty: bool = False,
            distributed: bool = False,
            hdfs_url=None):
        """
        Fit a time series forecasting pipeline with AutoML.

        :param train_df: the input dataframe (as a pandas DataFrame)
        :param validation_df: the validation dataframe (as a pandas DataFrame)
        :param recipe: the search configuration
        :param metric: the evaluation metric to optimize
        :param uncertainty: whether to enable uncertainty calculation
                            (will output an uncertainty sigma)
        :param hdfs_url: the hdfs_url to use for storing trial and intermediate results
        :param distributed: whether to enable distributed training
        :return: a TSPipeline
        """
        zoo_pipeline = self.internal.fit(train_df,
                                         validation_df,
                                         metric,
                                         recipe,
                                         mc=uncertainty,
                                         distributed=distributed,
                                         hdfs_url=hdfs_url)
        ppl = TSPipeline()
        ppl.internal = zoo_pipeline
        return ppl
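A minimal usage sketch (not part of the original source) tying the trainer to the kind of random dataframe used in the tests above; the default SmokeRecipe is used implicitly:

import numpy as np
import pandas as pd

# toy data shaped like the test fixtures above
sample_num = 100
train_df = pd.DataFrame({
    "datetime": pd.date_range('1/1/2019', periods=sample_num),
    "value": np.random.randn(sample_num)
})

trainer = AutoTSTrainer(horizon=1,
                        dt_col="datetime",
                        target_col="value",
                        extra_features_col=None)
ts_pipeline = trainer.fit(train_df, metric="mse")  # returns a TSPipeline

Running this end to end additionally needs the Spark/Ray setup shown in Example #5 (init_spark_on_local followed by RayContext(sc).init()).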
Example #9
    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 extra_features_col=None):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast
        :param extra_features_col: extra feature columns
        """
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
        )
Example #10
    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 extra_features_col=None):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast
        :param extra_features_col: extra feature columns
        """
        target_col_list = target_col
        if isinstance(target_col, str):
            target_col_list = [target_col]
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col_list,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
        )
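Relative to Example #9, the only change here is the target_col normalization: a single column name such as "value" is wrapped into a one-element list before being handed to TimeSequencePredictor, so a list of target columns (as built by get_input_tsp in Example #7) follows the same code path.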
Example #11
    def _hp_search(
        self,
        input_df,
        validation_df,
        config,
        metric,
        recipe,
        mc,
        resources_per_trial,
        remote_dir,
    ):
        def model_create_func():
            _model = XGBoost(model_type=self.model_type, config=config)
            if "cpu" in resources_per_trial:
                _model.set_params(n_jobs=resources_per_trial.get("cpu"))
            return _model

        model = model_create_func()
        ft = IdentityTransformer(self.feature_cols, self.target_col)

        # prepare parameters for search engine
        search_space = recipe.search_space(None)

        from zoo.automl.regression.time_sequence_predictor import TimeSequencePredictor
        metric_mode = TimeSequencePredictor._get_metric_mode(metric)
        searcher = RayTuneSearchEngine(
            logs_dir=self.logs_dir,
            resources_per_trial=resources_per_trial,
            name=self.name,
            remote_dir=remote_dir,
        )
        searcher.compile(
            input_df,
            model_create_func=model_create_func(),
            search_space=search_space,
            recipe=recipe,
            feature_transformers=ft,
            validation_df=validation_df,
            metric=metric,
            metric_mode=metric_mode,
            mc=mc,
        )
        # searcher.test_run()
        analysis = searcher.run()

        pipeline = self._make_pipeline(analysis,
                                       metric_mode,
                                       feature_transformers=ft,
                                       model=model,
                                       remote_dir=remote_dir)
        return pipeline
Example #12
    def _hp_search(self, input_df, validation_df, config, metric, recipe, mc,
                   resources_per_trial, remote_dir):
        def model_create_func():
            model = XGBoostRegressor(config)
            model.set_params(n_jobs=resources_per_trial)
            return model

        model = model_create_func()
        ft = IdentityTransformer(self.feature_cols, self.target_col)

        # prepare parameters for search engine
        search_space = recipe.search_space(None)
        runtime_params = recipe.runtime_params()
        num_samples = runtime_params['num_samples']
        stop = dict(runtime_params)
        search_algorithm_params = recipe.search_algorithm_params()
        search_algorithm = recipe.search_algorithm()
        fixed_params = recipe.fixed_params()
        del stop['num_samples']

        from zoo.automl.regression.time_sequence_predictor import TimeSequencePredictor
        metric_mode = TimeSequencePredictor._get_metric_mode(metric)
        searcher = RayTuneSearchEngine(
            logs_dir=self.logs_dir,
            resources_per_trial=resources_per_trial,
            name=self.name,
            remote_dir=remote_dir,
        )
        searcher.compile(input_df,
                         model_create_func=model_create_func(),
                         search_space=search_space,
                         stop=stop,
                         search_algorithm_params=search_algorithm_params,
                         search_algorithm=search_algorithm,
                         fixed_params=fixed_params,
                         feature_transformers=ft,
                         validation_df=validation_df,
                         metric=metric,
                         metric_mode=metric_mode,
                         mc=mc,
                         num_samples=num_samples)
        # searcher.test_run()
        analysis = searcher.run()

        pipeline = self._make_pipeline(analysis,
                                       metric_mode,
                                       feature_transformers=ft,
                                       model=model,
                                       remote_dir=remote_dir)
        return pipeline
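The two _hp_search variants differ mainly in how the recipe reaches the search engine: the version in Example #11 hands the recipe object to searcher.compile and lets the engine derive stop conditions and the search algorithm, whereas this one unpacks runtime_params, search_algorithm, and fixed_params itself and forwards each piece explicitly; it also passes resources_per_trial straight to XGBoostRegressor.set_params instead of extracting the "cpu" entry.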
Example #13
    def test_fit_LSTMGridRandomRecipe(self):
        from zoo.automl.config.recipe import LSTMGridRandomRecipe
        train_df, _, future_seq_len = self.create_dataset()
        tsp = TimeSequencePredictor(
            dt_col="datetime",
            target_col="value",
            future_seq_len=future_seq_len,
            extra_features_col=None,
        )
        pipeline = tsp.fit(train_df,
                           recipe=LSTMGridRandomRecipe(lstm_2_units=[4],
                                                       batch_size=[1024],
                                                       num_rand_samples=5,
                                                       look_back=2,
                                                       training_iteration=1,
                                                       epochs=1))
        assert isinstance(pipeline, TimeSequencePipeline)
        assert isinstance(pipeline.feature_transformers,
                          TimeSequenceFeatureTransformer)
        assert isinstance(pipeline.model, BaseModel)
        assert pipeline.config is not None
        assert 'past_seq_len' in pipeline.config
        assert pipeline.config["past_seq_len"] == 2
Example #14
    def test_save_restore(self):
        sample_num = 100
        train_df = pd.DataFrame({
            "datetime":
            pd.date_range('1/1/2019', periods=sample_num),
            "value":
            np.random.randn(sample_num)
        })
        sample_num = 64
        test_df = pd.DataFrame({
            "datetime":
            pd.date_range('1/10/2019', periods=sample_num),
            "value":
            np.random.randn(sample_num)
        })

        tsp = TimeSequencePredictor(
            dt_col="datetime",
            target_col="value",
            extra_features_col=None,
        )
        pipeline = tsp.fit(train_df)
        pred = pipeline.predict(test_df)

        dirname = tempfile.mkdtemp(prefix="saved_pipeline")
        try:
            save_pipeline_file = dirname
            pipeline.save(save_pipeline_file)

            new_pipeline = TimeSequencePipeline()
            new_pipeline.restore(save_pipeline_file)

            new_pred = new_pipeline.predict(test_df)
            np.testing.assert_allclose(pred["value"].values,
                                       new_pred["value"].values)
        finally:
            shutil.rmtree(dirname)
Example #15
class AutoTSTrainer:
    """
    The Automated Time Series Forecast Trainer
    """
    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 logs_dir="~/zoo_automl_logs",
                 extra_features_col=None,
                 search_alg=None,
                 search_alg_params=None,
                 scheduler=None,
                 scheduler_params=None,
                 name="automl"):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast
        :param extra_features_col: extra feature columns
        :param logs_dir: directory in which to save training logs
        :param search_alg: search algorithm used for the hyperparameter search
        :param search_alg_params: extra parameters for the search algorithm
        :param scheduler: trial scheduler used during the search
        :param scheduler_params: extra parameters for the scheduler
        :param name: name of the training job
        """
        target_col_list = target_col
        if isinstance(target_col, str):
            target_col_list = [target_col]
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col_list,
            logs_dir=logs_dir,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
            search_alg=search_alg,
            search_alg_params=search_alg_params,
            scheduler=scheduler,
            scheduler_params=scheduler_params,
            name=name)

    def fit(
        self,
        train_df,
        validation_df=None,
        metric="mse",
        recipe: Recipe = SmokeRecipe(),
        uncertainty: bool = False,
        upload_dir=None,
    ):
        """
        Fit a time series forecasting pipeline with AutoML.

        :param train_df: the input dataframe (as a pandas DataFrame)
        :param validation_df: the validation dataframe (as a pandas DataFrame)
        :param recipe: the search configuration
        :param metric: the evaluation metric to optimize
        :param uncertainty: whether to enable uncertainty calculation
                            (will output an uncertainty sigma)
        :param upload_dir: optional URI to sync training results and checkpoints;
            only HDFS URIs are supported for now
        :return: a TSPipeline
        """
        zoo_pipeline = self.internal.fit(train_df,
                                         validation_df,
                                         metric,
                                         recipe,
                                         mc=uncertainty,
                                         upload_dir=upload_dir)
        ppl = TSPipeline()
        ppl.internal = zoo_pipeline
        return ppl
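A brief sketch (again not from the source) of exercising the uncertainty and upload_dir options of this fit variant; train_df and validation_df are assumed to be pandas dataframes like the fixtures above, and the HDFS URI is purely illustrative:

trainer = AutoTSTrainer(horizon=1,
                        dt_col="datetime",
                        target_col="value",
                        logs_dir="~/zoo_automl_logs",
                        name="automl")
ts_pipeline = trainer.fit(train_df,
                          validation_df,
                          metric="mse",
                          uncertainty=True,  # predictions will carry an uncertainty sigma
                          upload_dir="hdfs://namenode:9000/automl_results")  # illustrative HDFS URI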