def test_predict(self):
    """A fitted pipeline predicts one row per sliding window over the test set."""
    # Training data must contain more rows than past_seq_len, whose default is 50.
    train_size = 100
    train_df = pd.DataFrame({
        "datetime": pd.date_range('1/1/2019', periods=train_size),
        "value": np.random.randn(train_size)
    })
    test_size = 64
    test_df = pd.DataFrame({
        "datetime": pd.date_range('1/10/2019', periods=test_size),
        "value": np.random.randn(test_size)
    })
    predictor = TimeSequencePredictor(
        dt_col="datetime",
        target_col="value",
        extra_features_col=None,
    )
    fitted_pipeline = predictor.fit(train_df)
    y_pred = fitted_pipeline.predict(test_df)
    # Each prediction consumes past_seq_len rows of history, so the output has
    # test_size - past_seq_len + 1 rows and two columns (datetime + value).
    default_past_seq_len = 50
    assert y_pred.shape == (test_size - default_past_seq_len + 1, 2)
def __init__(self,
             horizon=1,
             dt_col="datetime",
             target_col="value",
             logs_dir="~/zoo_automl_logs",
             extra_features_col=None,
             search_alg=None,
             search_alg_params=None,
             scheduler=None,
             scheduler_params=None,
             name="automl"):
    """
    Initialize the AutoTS Trainer.

    :param horizon: steps to look forward
    :param dt_col: the datetime column
    :param target_col: the target column to forecast (a string or a list
        of strings)
    :param logs_dir: directory for automl logs, forwarded to
        TimeSequencePredictor
    :param extra_features_col: extra feature columns
    :param search_alg: search algorithm, forwarded to TimeSequencePredictor
    :param search_alg_params: parameters for the search algorithm
    :param scheduler: trial scheduler, forwarded to TimeSequencePredictor
    :param scheduler_params: parameters for the scheduler
    :param name: name of this automl run
    """
    # The underlying predictor expects target_col as a list; normalize a
    # single column name into a one-element list.
    target_col_list = target_col
    if isinstance(target_col, str):
        target_col_list = [target_col]
    self.internal = TimeSequencePredictor(
        dt_col=dt_col,
        target_col=target_col_list,
        logs_dir=logs_dir,
        future_seq_len=horizon,
        extra_features_col=extra_features_col,
        search_alg=search_alg,
        search_alg_params=search_alg_params,
        scheduler=scheduler,
        scheduler_params=scheduler_params,
        name=name)
def test_fit_BayesRecipe(self):
    """Fitting with BayesRecipe yields a pipeline config cleaned of
    bayes-internal keys and a past_seq_len inside the look_back bounds."""
    from zoo.automl.config.recipe import BayesRecipe
    train_df, _, future_seq_len = self.create_dataset()
    predictor = TimeSequencePredictor(
        dt_col="datetime",
        target_col="value",
        future_seq_len=future_seq_len,
        extra_features_col=None,
    )
    recipe = BayesRecipe(num_samples=1,
                         training_iteration=2,
                         epochs=1,
                         look_back=(3, 5))
    pipeline = predictor.fit(train_df, recipe=recipe)
    assert isinstance(pipeline, TimeSequencePipeline)
    assert isinstance(pipeline.feature_transformers,
                      TimeSequenceFeatureTransformer)
    assert isinstance(pipeline.model, BaseModel)
    assert pipeline.config is not None
    assert "epochs" in pipeline.config
    # No internal bayes_feature* / *float keys may leak into the config.
    assert not any(key.startswith('bayes_feature') for key in pipeline.config)
    assert not any(key.endswith('float') for key in pipeline.config)
    assert 'past_seq_len' in pipeline.config
    assert 3 <= pipeline.config["past_seq_len"] <= 5
def test_valuate(self):
    """evaluate() runs with multiple metrics on a fitted pipeline."""
    # Training data must contain more rows than past_seq_len, whose default is 50.
    train_size = 100
    train_df = pd.DataFrame({
        "datetime": pd.date_range('1/1/2019', periods=train_size),
        "value": np.random.randn(train_size)
    })
    test_size = 64
    test_df = pd.DataFrame({
        "datetime": pd.date_range('1/10/2019', periods=test_size),
        "value": np.random.randn(test_size)
    })
    predictor = TimeSequencePredictor(
        dt_col="datetime",
        target_col="value",
        extra_features_col=None,
    )
    pipeline = predictor.fit(train_df)
    metrics = pipeline.evaluate(test_df,
                                metric=["mean_squared_error", "r_square"])
    print("evaluate:", metrics)
def main(train_path, pred_path, n_pred, dt, target, time_limit_min):
    """Train an AutoTS pipeline on a CSV file and write n_pred forecast rows.

    :param train_path: path of the training CSV
    :param pred_path: path where the forecast CSV is written
    :param n_pred: number of future steps to forecast
    :param dt: name of the datetime column
    :param target: name of the target column
    :param time_limit_min: wall-time budget for the trial runner, in minutes
    """
    # The trial runner reads its wall-time budget (seconds) from the environment.
    os.environ["TRIALRUNNER_WALLTIME_LIMIT"] = str(time_limit_min * 60)
    df_train = pd.read_csv(train_path)
    df_train[dt] = pd.to_datetime(df_train[dt])
    sc = init_spark_on_local(cores=mp.cpu_count(), spark_log_level="ERROR")
    ray_ctx = RayContext(sc=sc)
    ray_ctx.init()
    # Every column other than the datetime and target columns is an extra feature;
    # an empty list collapses to None.
    extra_features_col = list(set(df_train.columns) - {dt, target}) or None
    tsp = TimeSequencePredictor(dt_col=dt,
                                target_col=target,
                                extra_features_col=extra_features_col,
                                future_seq_len=n_pred)
    pipeline = tsp.fit(df_train,
                       resources_per_trial={"cpu": 4},
                       recipe=BayesRecipe(num_samples=100000))
    # Forecast from the tail of the training data; the first output row holds
    # the forecast horizon.
    df_pred = pipeline.predict(df_train[-2:])
    freq = pd.infer_freq(df_train[dt])
    x_pred = pd.date_range(df_pred.iloc[0][dt], periods=n_pred, freq=freq)
    y_pred = df_pred.iloc[0][1:]
    df_pred = pd.DataFrame({dt: x_pred, target: y_pred})
    df_pred.to_csv(pred_path, index=False)
def test_fit_SmokeRecipe(self):
    """fit() with the default recipe returns a fully assembled pipeline."""
    train_df, validation_df, future_seq_len = self.create_dataset()
    predictor = TimeSequencePredictor(
        dt_col="datetime",
        target_col="value",
        future_seq_len=future_seq_len,
        extra_features_col=None,
    )
    pipeline = predictor.fit(train_df, validation_df)
    assert isinstance(pipeline, TimeSequencePipeline)
    assert isinstance(pipeline.feature_transformers,
                      TimeSequenceFeatureTransformer)
    assert isinstance(pipeline.model, BaseModel)
    assert pipeline.config is not None
def get_input_tsp(self, future_seq_len, target_col):
    """Build random train/test frames and a matching predictor.

    :param future_seq_len: forecast horizon for the predictor
    :param target_col: a single target column name or a list of names
    :return: (train_df, test_df, tsp, test_sample_num)
    """
    train_rows = np.random.randint(100, 200)
    test_rows = np.random.randint(20, 30)
    if isinstance(target_col, str):
        train_df = pd.DataFrame({
            "datetime": pd.date_range('1/1/2019', periods=train_rows),
            target_col: np.random.randn(train_rows)
        })
        test_df = pd.DataFrame({
            "datetime": pd.date_range('1/1/2019', periods=test_rows),
            target_col: np.random.randn(test_rows)
        })
    else:
        # Multi-target case: one random column per target name.
        train_df = pd.DataFrame({col: np.random.randn(train_rows)
                                 for col in target_col})
        train_df["datetime"] = pd.date_range('1/1/2019', periods=train_rows)
        test_df = pd.DataFrame({col: np.random.randn(test_rows)
                                for col in target_col})
        test_df["datetime"] = pd.date_range('1/1/2019', periods=test_rows)
    tsp = TimeSequencePredictor(
        dt_col="datetime",
        target_col=target_col,
        future_seq_len=future_seq_len,
        extra_features_col=None,
    )
    return train_df, test_df, tsp, test_rows
class AutoTSTrainer:
    """
    The Automated Time Series Forecast Trainer
    """

    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 extra_features_col=None):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast (a string or a
            list of strings)
        :param extra_features_col: extra feature columns
        """
        # The underlying predictor expects target_col as a list; normalize
        # a single column name into a one-element list.
        target_col_list = target_col
        if isinstance(target_col, str):
            target_col_list = [target_col]
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col_list,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
        )

    def fit(self,
            train_df,
            validation_df=None,
            metric="mse",
            recipe: Recipe = None,
            uncertainty: bool = False,
            distributed: bool = False,
            hdfs_url=None):
        """
        Fit a time series forecasting pipeline w/ automl

        :param train_df: the input dataframe (as pandas.dataframe)
        :param validation_df: the validation dataframe (as pandas.dataframe)
        :param metric: the evaluation metric to optimize
        :param recipe: the configuration of searching; defaults to a fresh
                       SmokeRecipe() when omitted
        :param uncertainty: whether to enable uncertainty calculation
                            (will output an uncertainty sigma)
        :param distributed: whether to enable distributed training
        :param hdfs_url: the hdfs_url to use for storing trail and
                         intermediate results
        :return: a TSPipeline
        """
        # Fix: the previous default `recipe=SmokeRecipe()` was a mutable
        # default argument — one instance created at definition time and
        # shared across every call. Build a fresh recipe per call instead.
        if recipe is None:
            recipe = SmokeRecipe()
        zoo_pipeline = self.internal.fit(train_df,
                                         validation_df,
                                         metric,
                                         recipe,
                                         mc=uncertainty,
                                         distributed=distributed,
                                         hdfs_url=hdfs_url)
        ppl = TSPipeline()
        ppl.internal = zoo_pipeline
        return ppl
def __init__(self,
             horizon=1,
             dt_col="datetime",
             target_col="value",
             extra_features_col=None):
    """
    Initialize the AutoTS Trainer.

    :param horizon: steps to look forward
    :param dt_col: the datetime column
    :param target_col: the target column to forecast
    :param extra_features_col: extra feature columns
    """
    # NOTE(review): target_col is forwarded as-is here (not wrapped in a
    # list) — confirm TimeSequencePredictor accepts a bare string.
    self.internal = TimeSequencePredictor(
        dt_col=dt_col,
        target_col=target_col,
        future_seq_len=horizon,
        extra_features_col=extra_features_col,
    )
def __init__(self,
             horizon=1,
             dt_col="datetime",
             target_col="value",
             extra_features_col=None):
    """
    Initialize the AutoTS Trainer.

    :param horizon: steps to look forward
    :param dt_col: the datetime column
    :param target_col: the target column to forecast
    :param extra_features_col: extra feature columns
    """
    # The underlying predictor takes target columns as a list.
    columns = [target_col] if isinstance(target_col, str) else target_col
    self.internal = TimeSequencePredictor(
        dt_col=dt_col,
        target_col=columns,
        future_seq_len=horizon,
        extra_features_col=extra_features_col,
    )
def _hp_search(
    self,
    input_df,
    validation_df,
    config,
    metric,
    recipe,
    mc,
    resources_per_trial,
    remote_dir,
):
    """Run a Ray Tune hyperparameter search for the XGBoost model and
    return the pipeline built from the best trial.

    :param input_df: training dataframe
    :param validation_df: validation dataframe
    :param config: base configuration passed to XGBoost
    :param metric: the evaluation metric to optimize
    :param recipe: search recipe providing the search space
    :param mc: uncertainty flag forwarded to the search engine
    :param resources_per_trial: per-trial resources; when it contains a
        "cpu" entry, xgboost threads are capped to that budget
    :param remote_dir: remote directory for syncing trial results
    :return: the pipeline built from the best trial
    """

    def model_create_func():
        # Each trial builds its own model; cap xgboost threads to the
        # per-trial cpu budget when one is given.
        _model = XGBoost(model_type=self.model_type, config=config)
        if "cpu" in resources_per_trial:
            _model.set_params(n_jobs=resources_per_trial.get("cpu"))
        return _model

    model = model_create_func()
    ft = IdentityTransformer(self.feature_cols, self.target_col)
    # prepare parameters for search engine
    search_space = recipe.search_space(None)
    # Imported locally — presumably to avoid a circular import at module
    # load time; verify before moving to the top of the file.
    from zoo.automl.regression.time_sequence_predictor import TimeSequencePredictor
    metric_mode = TimeSequencePredictor._get_metric_mode(metric)
    searcher = RayTuneSearchEngine(
        logs_dir=self.logs_dir,
        resources_per_trial=resources_per_trial,
        name=self.name,
        remote_dir=remote_dir,
    )
    # NOTE(review): model_create_func() is *called* here, so a model
    # instance (not the factory) is passed as model_create_func —
    # confirm that searcher.compile expects an instance.
    searcher.compile(
        input_df,
        model_create_func=model_create_func(),
        search_space=search_space,
        recipe=recipe,
        feature_transformers=ft,
        validation_df=validation_df,
        metric=metric,
        metric_mode=metric_mode,
        mc=mc,
    )
    # searcher.test_run()
    analysis = searcher.run()
    pipeline = self._make_pipeline(analysis,
                                   metric_mode,
                                   feature_transformers=ft,
                                   model=model,
                                   remote_dir=remote_dir)
    return pipeline
def _hp_search(self,
               input_df,
               validation_df,
               config,
               metric,
               recipe,
               mc,
               resources_per_trial,
               remote_dir):
    """Run a Ray Tune hyperparameter search for an XGBoostRegressor and
    return the pipeline built from the best trial.

    :param input_df: training dataframe
    :param validation_df: validation dataframe
    :param config: base configuration for XGBoostRegressor
    :param metric: the evaluation metric to optimize
    :param recipe: search recipe providing search space and runtime params
    :param mc: uncertainty flag forwarded to the search engine
    :param resources_per_trial: per-trial resource dict, e.g. {"cpu": 4}
    :param remote_dir: remote directory for syncing trial results
    :return: the pipeline built from the best trial
    """

    def model_create_func():
        model = XGBoostRegressor(config)
        # Fix: resources_per_trial is a resource dict such as {"cpu": 4};
        # previously the whole dict was passed as n_jobs. Cap xgboost
        # threads to the per-trial cpu budget instead, mirroring the
        # guarded lookup used elsewhere in this module.
        if "cpu" in resources_per_trial:
            model.set_params(n_jobs=resources_per_trial.get("cpu"))
        return model

    model = model_create_func()
    ft = IdentityTransformer(self.feature_cols, self.target_col)
    # prepare parameters for search engine
    search_space = recipe.search_space(None)
    runtime_params = recipe.runtime_params()
    num_samples = runtime_params['num_samples']
    # Everything in runtime_params except num_samples is passed as the
    # stop condition.
    stop = dict(runtime_params)
    del stop['num_samples']
    search_algorithm_params = recipe.search_algorithm_params()
    search_algorithm = recipe.search_algorithm()
    fixed_params = recipe.fixed_params()
    # Imported locally to avoid importing the predictor at module load time.
    from zoo.automl.regression.time_sequence_predictor import TimeSequencePredictor
    metric_mode = TimeSequencePredictor._get_metric_mode(metric)
    searcher = RayTuneSearchEngine(
        logs_dir=self.logs_dir,
        resources_per_trial=resources_per_trial,
        name=self.name,
        remote_dir=remote_dir,
    )
    # NOTE(review): model_create_func() is *called* here, so a model
    # instance (not the factory) is passed as model_create_func —
    # confirm that searcher.compile expects an instance.
    searcher.compile(input_df,
                     model_create_func=model_create_func(),
                     search_space=search_space,
                     stop=stop,
                     search_algorithm_params=search_algorithm_params,
                     search_algorithm=search_algorithm,
                     fixed_params=fixed_params,
                     feature_transformers=ft,
                     validation_df=validation_df,
                     metric=metric,
                     metric_mode=metric_mode,
                     mc=mc,
                     num_samples=num_samples)
    # searcher.test_run()
    analysis = searcher.run()
    pipeline = self._make_pipeline(analysis,
                                   metric_mode,
                                   feature_transformers=ft,
                                   model=model,
                                   remote_dir=remote_dir)
    return pipeline
def test_fit_LSTMGridRandomRecipe(self):
    """LSTMGridRandomRecipe with a fixed look_back pins past_seq_len."""
    from zoo.automl.config.recipe import LSTMGridRandomRecipe
    train_df, _, future_seq_len = self.create_dataset()
    predictor = TimeSequencePredictor(
        dt_col="datetime",
        target_col="value",
        future_seq_len=future_seq_len,
        extra_features_col=None,
    )
    recipe = LSTMGridRandomRecipe(lstm_2_units=[4],
                                  batch_size=[1024],
                                  num_rand_samples=5,
                                  look_back=2,
                                  training_iteration=1,
                                  epochs=1)
    pipeline = predictor.fit(train_df, recipe=recipe)
    assert isinstance(pipeline, TimeSequencePipeline)
    assert isinstance(pipeline.feature_transformers,
                      TimeSequenceFeatureTransformer)
    assert isinstance(pipeline.model, BaseModel)
    assert pipeline.config is not None
    # look_back=2 is a single fixed value, so the tuned past_seq_len
    # must equal it.
    assert 'past_seq_len' in pipeline.config
    assert pipeline.config["past_seq_len"] == 2
def test_save_restore(self):
    """A restored pipeline reproduces the predictions of the saved one."""
    train_size = 100
    train_df = pd.DataFrame({
        "datetime": pd.date_range('1/1/2019', periods=train_size),
        "value": np.random.randn(train_size)
    })
    test_size = 64
    test_df = pd.DataFrame({
        "datetime": pd.date_range('1/10/2019', periods=test_size),
        "value": np.random.randn(test_size)
    })
    predictor = TimeSequencePredictor(
        dt_col="datetime",
        target_col="value",
        extra_features_col=None,
    )
    pipeline = predictor.fit(train_df)
    pred = pipeline.predict(test_df)
    dirname = tempfile.mkdtemp(prefix="saved_pipeline")
    try:
        pipeline.save(dirname)
        restored = TimeSequencePipeline()
        restored.restore(dirname)
        new_pred = restored.predict(test_df)
        # Saved and restored pipelines must agree on every prediction.
        np.testing.assert_allclose(pred["value"].values,
                                   new_pred["value"].values)
    finally:
        shutil.rmtree(dirname)
class AutoTSTrainer:
    """
    The Automated Time Series Forecast Trainer
    """

    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 logs_dir="~/zoo_automl_logs",
                 extra_features_col=None,
                 search_alg=None,
                 search_alg_params=None,
                 scheduler=None,
                 scheduler_params=None,
                 name="automl"):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast (a string or a
            list of strings)
        :param logs_dir: directory for automl logs, forwarded to
            TimeSequencePredictor
        :param extra_features_col: extra feature columns
        :param search_alg: search algorithm, forwarded to
            TimeSequencePredictor
        :param search_alg_params: parameters for the search algorithm
        :param scheduler: trial scheduler, forwarded to
            TimeSequencePredictor
        :param scheduler_params: parameters for the scheduler
        :param name: name of this automl run
        """
        # The underlying predictor expects target_col as a list; normalize
        # a single column name into a one-element list.
        target_col_list = target_col
        if isinstance(target_col, str):
            target_col_list = [target_col]
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col_list,
            logs_dir=logs_dir,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
            search_alg=search_alg,
            search_alg_params=search_alg_params,
            scheduler=scheduler,
            scheduler_params=scheduler_params,
            name=name)

    def fit(
        self,
        train_df,
        validation_df=None,
        metric="mse",
        recipe: Recipe = None,
        uncertainty: bool = False,
        upload_dir=None,
    ):
        """
        Fit a time series forecasting pipeline w/ automl

        :param train_df: the input dataframe (as pandas.dataframe)
        :param validation_df: the validation dataframe (as pandas.dataframe)
        :param metric: the evaluation metric to optimize
        :param recipe: the configuration of searching; defaults to a fresh
                       SmokeRecipe() when omitted
        :param uncertainty: whether to enable uncertainty calculation
                            (will output an uncertainty sigma)
        :param upload_dir: Optional URI to sync training results and
                           checkpoints. We only support hdfs URI for now.
        :return: a TSPipeline
        """
        # Fix: the previous default `recipe=SmokeRecipe()` was a mutable
        # default argument — one instance created at definition time and
        # shared across every call. Build a fresh recipe per call instead.
        if recipe is None:
            recipe = SmokeRecipe()
        zoo_pipeline = self.internal.fit(train_df,
                                         validation_df,
                                         metric,
                                         recipe,
                                         mc=uncertainty,
                                         upload_dir=upload_dir)
        ppl = TSPipeline()
        ppl.internal = zoo_pipeline
        return ppl