예제 #1
0
    def _hp_search(self,
                   input_df,
                   validation_df,
                   metric,
                   recipe,
                   mc,
                   resources_per_trial,
                   remote_dir):

        ft = TimeSequenceFeatureTransformer(self.future_seq_len,
                                            self.dt_col,
                                            self.target_col,
                                            self.extra_features_col,
                                            self.drop_missing)
        if isinstance(input_df, list):
            feature_list = ft.get_feature_list(input_df[0])
        else:
            feature_list = ft.get_feature_list(input_df)

        # model = VanillaLSTM(check_optional_config=False)
        model = TimeSequenceModel(check_optional_config=False, future_seq_len=self.future_seq_len)

        # prepare parameters for search engine
        search_space = recipe.search_space(feature_list)
        runtime_params = recipe.runtime_params()
        num_samples = runtime_params['num_samples']
        stop = dict(runtime_params)
        search_algorithm_params = recipe.search_algorithm_params()
        search_algorithm = recipe.search_algorithm()
        fixed_params = recipe.fixed_params()
        del stop['num_samples']

        searcher = RayTuneSearchEngine(logs_dir=self.logs_dir,
                                       resources_per_trial=resources_per_trial,
                                       name=self.name,
                                       remote_dir=remote_dir,
                                       )
        searcher.compile(input_df,
                         search_space=search_space,
                         stop=stop,
                         search_algorithm_params=search_algorithm_params,
                         search_algorithm=search_algorithm,
                         fixed_params=fixed_params,
                         # feature_transformers=TimeSequenceFeatures,
                         feature_transformers=ft,
                         # model=model,
                         future_seq_len=self.future_seq_len,
                         validation_df=validation_df,
                         metric=metric,
                         mc=mc,
                         num_samples=num_samples)
        # searcher.test_run()
        searcher.run()

        best = searcher.get_best_trials(k=1)[0]  # get the best one trial, later could be n
        pipeline = self._make_pipeline(best,
                                       feature_transformers=ft,
                                       model=model,
                                       remote_dir=remote_dir)
        return pipeline
예제 #2
0
    def setup_method(self, method):
        # super().setup_method(method)
        train_data = pd.DataFrame(data=np.random.randn(64, 4))
        val_data = pd.DataFrame(data=np.random.randn(16, 4))
        test_data = pd.DataFrame(data=np.random.randn(16, 4))

        future_seq_len = 1
        past_seq_len = 6

        # use roll method in time_sequence
        tsft = TimeSequenceFeatureTransformer()
        self.x_train, self.y_train = tsft._roll_train(
            train_data,
            past_seq_len=past_seq_len,
            future_seq_len=future_seq_len)
        self.x_val, self.y_val = tsft._roll_train(
            val_data, past_seq_len=past_seq_len, future_seq_len=future_seq_len)
        self.x_test = tsft._roll_test(test_data, past_seq_len=past_seq_len)
        self.config = {
            'epochs': 1,
            "lr": 0.001,
            "lstm_1_units": 16,
            "dropout_1": 0.2,
            "lstm_2_units": 10,
            "dropout_2": 0.2,
            "batch_size": 32,
        }
        self.model = VanillaLSTM(check_optional_config=False,
                                 future_seq_len=future_seq_len)
예제 #3
0
    def test_predict(self):
        train_data = pd.DataFrame(data=np.random.randn(64, 4))
        test_data = pd.DataFrame(data=np.random.randn(16, 4))
        future_seq_len = 1
        past_seq_len = 6

        # use roll method in time_sequence
        tsft = TimeSequenceFeatureTransformer()
        x_train, y_train = tsft._roll_train(train_data,
                                            past_seq_len=past_seq_len,
                                            future_seq_len=future_seq_len)
        x_test = tsft._roll_test(test_data, past_seq_len=past_seq_len)

        config = {
            'epochs': 2,
            "lr": 0.001,
            "lstm_1_units": 16,
            "dropout_1": 0.2,
            "lstm_2_units": 10,
            "dropout_2": 0.2,
            "batch_size": 32,
        }
        model = VanillaLSTM(check_optional_config=False,
                            future_seq_len=future_seq_len)
        model.fit_eval(x_train, y_train, **config)
        y_pred = model.predict(x_test)
        assert y_pred.shape == (x_test.shape[0], 1)
예제 #4
0
 def test_evaluate(self):
     train_data = pd.DataFrame(data=np.random.randn(64, 4))
     val_data = pd.DataFrame(data=np.random.randn(16, 4))
     future_seq_len = 1
     past_seq_len = 6
     # use roll method in time_sequence
     tsft = TimeSequenceFeatureTransformer()
     x_train, y_train = tsft._roll_train(train_data,
                                         past_seq_len=past_seq_len,
                                         future_seq_len=future_seq_len)
     x_val, y_val = tsft._roll_train(val_data,
                                     past_seq_len=past_seq_len,
                                     future_seq_len=future_seq_len)
     config = {
         'epochs': 1,
         "lr": 0.001,
         "lstm_1_units": 16,
         "dropout_1": 0.2,
         "lstm_2_units": 10,
         "dropout_2": 0.2,
         "batch_size": 32,
     }
     model = VanillaLSTM(check_optional_config=False,
                         future_seq_len=future_seq_len)
     model.fit_eval(x_train, y_train, **config)
     print("evaluate:", model.evaluate(x_val, y_val))
예제 #5
0
 def setup_method(self, method):
     tf.keras.backend.clear_session()
     self.ft = TimeSequenceFeatureTransformer()
     self.create_data()
     self.model = MTNetKeras()
     self.config = {"long_num": self.long_num,
                    "time_step": self.time_step,
                    "ar_window": np.random.randint(1, 3),
                    "cnn_height": np.random.randint(1, 3),
                    "epochs": 1}
예제 #6
0
class TestZouwuModelMTNetForecaster(TestCase):
    def setUp(self):
        tf.keras.backend.clear_session()
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()

    def tearDown(self):
        pass

    def create_data(self):
        def gen_train_sample(data, past_seq_len, future_seq_len):
            data = pd.DataFrame(data)
            x, y = self.ft._roll_train(data,
                                       past_seq_len=past_seq_len,
                                       future_seq_len=future_seq_len)
            return x, y

        def gen_test_sample(data, past_seq_len):
            test_data = pd.DataFrame(data)
            x = self.ft._roll_test(test_data, past_seq_len=past_seq_len)
            return x

        self.long_num = 6
        self.time_step = 2
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1
        self.x_train, self.y_train = gen_train_sample(
            data=np.random.randn(64, 4),
            past_seq_len=look_back,
            future_seq_len=look_forward)
        self.x_val, self.y_val = gen_train_sample(data=np.random.randn(16, 4),
                                                  past_seq_len=look_back,
                                                  future_seq_len=look_forward)
        self.x_test = gen_test_sample(data=np.random.randn(16, 4),
                                      past_seq_len=look_back)

    def test_forecast_mtnet(self):
        # TODO hacking to fix a bug
        target_dim = 1
        model = MTNetForecaster(target_dim=target_dim,
                                feature_dim=self.x_train.shape[-1],
                                long_series_num=self.long_num,
                                series_length=self.time_step)
        x_train_long, x_train_short = model.preprocess_input(self.x_train)
        x_val_long, x_val_short = model.preprocess_input(self.x_val)
        x_test_long, x_test_short = model.preprocess_input(self.x_test)

        model.fit([x_train_long, x_train_short],
                  self.y_train,
                  validation_data=([x_val_long, x_val_short], self.y_val),
                  batch_size=32,
                  distributed=False)
        assert model.evaluate([x_val_long, x_val_short], self.y_val)
        predict_result = model.predict([x_test_long, x_test_short])
        assert predict_result.shape == (self.x_test.shape[0], target_dim)
    def _hp_search(self, input_df, validation_df, metric, recipe, mc,
                   resources_per_trial, remote_dir):
        ft = TimeSequenceFeatureTransformer(self.future_seq_len, self.dt_col,
                                            self.target_col,
                                            self.extra_features_col,
                                            self.drop_missing)
        if isinstance(input_df, list):
            feature_list = ft.get_feature_list(input_df[0])
        else:
            feature_list = ft.get_feature_list(input_df)

        def model_create_func():
            # model = VanillaLSTM(check_optional_config=False)
            _model = TimeSequenceModel(check_optional_config=False,
                                       future_seq_len=self.future_seq_len)
            return _model

        model = model_create_func()

        # prepare parameters for search engine
        search_space = recipe.search_space(feature_list)

        metric_mode = TimeSequencePredictor._get_metric_mode(metric)
        searcher = RayTuneSearchEngine(
            logs_dir=self.logs_dir,
            resources_per_trial=resources_per_trial,
            name=self.name,
            remote_dir=remote_dir,
        )
        searcher.compile(
            input_df,
            model_create_func=model_create_func(),
            search_space=search_space,
            recipe=recipe,
            feature_transformers=ft,
            future_seq_len=self.future_seq_len,
            validation_df=validation_df,
            metric=metric,
            metric_mode=metric_mode,
            mc=mc,
        )
        # searcher.test_run()
        analysis = searcher.run()

        pipeline = self._make_pipeline(analysis,
                                       metric_mode,
                                       feature_transformers=ft,
                                       model=model,
                                       remote_dir=remote_dir)
        return pipeline
예제 #8
0
    def fit_with_fixed_configs(self,
                               input_df,
                               validation_df=None,
                               mc=False,
                               **user_configs):
        """
        Fit pipeline with fixed configs. The model will be trained from initialization
        with the hyper-parameter specified in configs. The configs contain both identity configs
        (Eg. "future_seq_len", "dt_col", "target_col", "metric") and automl tunable configs
        (Eg. "past_seq_len", "batch_size").
        We recommend calling get_default_configs to see the name and default values of configs you
        you can specify.
        :param input_df: one data frame or a list of data frames
        :param validation_df: one data frame or a list of data frames
        :param user_configs: you can overwrite or add more configs with user_configs. Eg. "epochs"
        :return:
        """
        # self._check_configs()
        if self.config is None:
            self.config = self.get_default_configs()
        if user_configs is not None:
            self.config.update(user_configs)
        ft_id_config_set = {
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        }
        ft_id_configs = {a: self.config[a] for a in ft_id_config_set}
        self.feature_transformers = TimeSequenceFeatureTransformer(
            **ft_id_configs)
        model_id_config_set = {'future_seq_len'}
        ft_id_configs = {a: self.config[a] for a in model_id_config_set}
        self.model = TimeSequenceModel(check_optional_config=False,
                                       **ft_id_configs)
        all_available_features = self.feature_transformers.get_feature_list(
            input_df)
        self.config.update({"selected_features": all_available_features})
        (x_train, y_train) = self.feature_transformers.fit_transform(
            input_df, **self.config)
        if self._is_val_df_valid(validation_df):
            validation_data = self.feature_transformers.transform(
                validation_df)
        else:
            validation_data = None

        self.model.fit_eval(x_train,
                            y_train,
                            validation_data=validation_data,
                            mc=mc,
                            verbose=1,
                            **self.config)
예제 #9
0
 def __init__(self, feature_transformers=None, model=None, config=None):
     """
     initialize a pipeline
     :param model: the internal model
     :param feature_transformers: the feature transformers
     """
     if feature_transformers is None:
         assert model is None and config is None
         self.feature_transformers = TimeSequenceFeatureTransformer()
         self.model = VanillaLSTM(check_optional_config=False)
         print("Initialize new time sequence pipeline.")
     else:
         self.feature_transformers = feature_transformers
         self.model = model
         self.config = config
예제 #10
0
    def test_evaluate_predict_future_more_1(self):
        target_col = "values"
        metrics = ["mse", "r2"]
        future_seq_len = np.random.randint(2, 6)
        train_df, test_df, tsp, test_sample_num = self.get_input_tsp(
            future_seq_len, target_col)
        pipeline = tsp.fit(train_df, test_df)
        mse, rs = pipeline.evaluate(test_df, metrics=metrics)
        assert len(mse) == future_seq_len
        assert len(rs) == future_seq_len
        y_pred = pipeline.predict(test_df)
        assert y_pred.shape == (test_sample_num - default_past_seq_len + 1,
                                future_seq_len + 1)

        y_pred_df = pipeline.predict(test_df[:-future_seq_len])
        columns = [
            "{}_{}".format(target_col, i) for i in range(future_seq_len)
        ]
        y_pred_value = y_pred_df[columns].values

        y_df = test_df[default_past_seq_len:]
        y_value = TimeSequenceFeatureTransformer()._roll_test(
            y_df[target_col], future_seq_len)

        mse_pred_eval, rs_pred_eval = [
            Evaluator.evaluate(m, y_value, y_pred_value) for m in metrics
        ]
        mse_eval, rs_eval = pipeline.evaluate(test_df, metrics)
        assert_array_almost_equal(mse_pred_eval, mse_eval, decimal=2)
        assert_array_almost_equal(rs_pred_eval, rs_eval, decimal=2)
예제 #11
0
 def create_feature_transformer(self):
     ft = TimeSequenceFeatureTransformer(self.future_seq_len,
                                         self.dt_col,
                                         self.target_col,
                                         self.extra_features_col,
                                         self.drop_missing)
     return ft
예제 #12
0
class TestZouwuModelLSTMForecaster(TestCase):
    def setUp(self):
        tf.keras.backend.clear_session()
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()

    def tearDown(self):
        pass

    def create_data(self):
        def gen_train_sample(data, past_seq_len, future_seq_len):
            data = pd.DataFrame(data)
            x, y = self.ft._roll_train(data,
                                       past_seq_len=past_seq_len,
                                       future_seq_len=future_seq_len)
            return x, y

        def gen_test_sample(data, past_seq_len):
            test_data = pd.DataFrame(data)
            x = self.ft._roll_test(test_data, past_seq_len=past_seq_len)
            return x

        self.long_num = 6
        self.time_step = 2
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1
        self.x_train, self.y_train = gen_train_sample(
            data=np.random.randn(64, 4),
            past_seq_len=look_back,
            future_seq_len=look_forward)
        self.x_val, self.y_val = gen_train_sample(data=np.random.randn(16, 4),
                                                  past_seq_len=look_back,
                                                  future_seq_len=look_forward)
        self.x_test = gen_test_sample(data=np.random.randn(16, 4),
                                      past_seq_len=look_back)

    def test_forecast_lstm(self):
        # TODO hacking to fix a bug
        model = LSTMForecaster(target_dim=1,
                               feature_dim=self.x_train.shape[-1])
        model.fit(self.x_train,
                  self.y_train,
                  validation_data=(self.x_val, self.y_val),
                  batch_size=8,
                  distributed=False)
        model.evaluate(self.x_val, self.y_val)
        model.predict(self.x_test)
예제 #13
0
def load_ts_pipeline(file):
    feature_transformers = TimeSequenceFeatureTransformer()
    model = TimeSequenceModel(check_optional_config=False)

    all_config = restore_zip(file, feature_transformers, model)
    ts_pipeline = TimeSequencePipeline(feature_transformers=feature_transformers,
                                       model=model,
                                       config=all_config)
    print("Restore pipeline from", file)
    return ts_pipeline
예제 #14
0
    def test_save_restore(self):
        train_data = pd.DataFrame(data=np.random.randn(64, 4))
        test_data = pd.DataFrame(data=np.random.randn(16, 4))
        future_seq_len = 1
        past_seq_len = 6

        # use roll method in time_sequence
        tsft = TimeSequenceFeatureTransformer()
        x_train, y_train = tsft._roll_train(train_data,
                                            past_seq_len=past_seq_len,
                                            future_seq_len=future_seq_len)
        x_test = tsft._roll_test(test_data, past_seq_len=past_seq_len)

        config = {
            'epochs': 2,
            "lr": 0.001,
            "lstm_1_units": 16,
            "dropout_1": 0.2,
            "lstm_2_units": 10,
            "dropout_2": 0.2,
            "batch_size": 32,
        }

        dirname = tempfile.mkdtemp(prefix="automl_test_vanilla")
        try:
            model = VanillaLSTM(check_optional_config=False,
                                future_seq_len=future_seq_len)
            model.fit_eval(x_train, y_train, **config)
            predict_before = model.predict(x_test)

            model_path = os.path.join(dirname, "testmodel.h5")
            config_path = os.path.join(dirname, "local_config.json")

            model.save(model_path=model_path, config_path=config_path)

            local_config = load_config(config_path)
            config.update(local_config)
            model.restore(model_path=model_path, **config)
            predict_after = model.predict(x_test)
            assert np.allclose(predict_before, predict_after)
        finally:
            shutil.rmtree(dirname)
예제 #15
0
    def setup_method(self, method):
        # super().setup_method(method)
        self.train_data = pd.DataFrame(data=np.random.randn(64, 4))
        self.val_data = pd.DataFrame(data=np.random.randn(16, 4))
        self.test_data = pd.DataFrame(data=np.random.randn(16, 4))

        self.past_seq_len = 6
        self.future_seq_len_1 = 1
        self.future_seq_len_2 = 2

        # use roll method in time_sequence
        self.feat = TimeSequenceFeatureTransformer()

        self.config = {'batch_size': 32, 'epochs': 1}

        self.model_1 = LSTMSeq2Seq(check_optional_config=False,
                                   future_seq_len=self.future_seq_len_1)
        self.model_2 = LSTMSeq2Seq(check_optional_config=False,
                                   future_seq_len=self.future_seq_len_2)

        self.fitted = False
        self.predict_1 = None
        self.predict_2 = None
 def test_dataframe_input_with_datetime(self):
     train_df, validation_df, future_seq_len = get_ts_input()
     dataframe_with_datetime = {'df': train_df, 'val_df': validation_df}
     ft = TimeSequenceFeatureTransformer(future_seq_len=future_seq_len,
                                         dt_col="datetime",
                                         target_col="value")
     searcher = prepare_searcher(
         data=dataframe_with_datetime,
         model_creator=LSTM_model_creator,
         name='test_ray_dateframe_with_datetime_with_val',
         feature_transformer=ft)
     searcher.run()
     best_trials = searcher.get_best_trials(k=1)
     assert best_trials is not None
예제 #17
0
class TestSeq2Seq(ZooTestCase):
    def setup_method(self, method):
        # super().setup_method(method)
        self.train_data = pd.DataFrame(data=np.random.randn(64, 4))
        self.val_data = pd.DataFrame(data=np.random.randn(16, 4))
        self.test_data = pd.DataFrame(data=np.random.randn(16, 4))

        self.past_seq_len = 6
        self.future_seq_len_1 = 1
        self.future_seq_len_2 = 2

        # use roll method in time_sequence
        self.feat = TimeSequenceFeatureTransformer()

        self.config = {'batch_size': 32, 'epochs': 1}

        self.model_1 = LSTMSeq2Seq(check_optional_config=False,
                                   future_seq_len=self.future_seq_len_1)
        self.model_2 = LSTMSeq2Seq(check_optional_config=False,
                                   future_seq_len=self.future_seq_len_2)

        self.fitted = False
        self.predict_1 = None
        self.predict_2 = None

    def teardown_method(self, method):
        pass

    def test_fit_eval_1(self):
        x_train_1, y_train_1 = self.feat._roll_train(
            self.train_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_1)
        print("fit_eval_future_seq_len_1:",
              self.model_1.fit_eval(x_train_1, y_train_1, **self.config))
        assert self.model_1.past_seq_len == 6
        assert self.model_1.feature_num == 4
        assert self.model_1.future_seq_len == 1
        assert self.model_1.target_col_num == 1

    def test_fit_eval_2(self):
        x_train_2, y_train_2 = self.feat._roll_train(
            self.train_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_2)
        print("fit_eval_future_seq_len_2:",
              self.model_2.fit_eval(x_train_2, y_train_2, **self.config))
        assert self.model_2.future_seq_len == 2

        self.fitted = True

    def test_evaluate_1(self):
        x_train_1, y_train_1 = self.feat._roll_train(
            self.train_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_1)
        x_val_1, y_val_1 = self.feat._roll_train(
            self.val_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_1)

        self.model_1.fit_eval(x_train_1, y_train_1, **self.config)

        print("evaluate_future_seq_len_1:",
              self.model_1.evaluate(x_val_1, y_val_1, metric=['mse', 'r2']))

    def test_evaluate_2(self):
        x_train_2, y_train_2 = self.feat._roll_train(
            self.train_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_2)
        x_val_2, y_val_2 = self.feat._roll_train(
            self.val_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_2)

        self.model_2.fit_eval(x_train_2, y_train_2, **self.config)

        print("evaluate_future_seq_len_2:",
              self.model_2.evaluate(x_val_2, y_val_2, metric=['mse', 'r2']))

    def test_predict_1(self):
        x_train_1, y_train_1 = self.feat._roll_train(
            self.train_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_1)
        x_test_1 = self.feat._roll_test(self.test_data,
                                        past_seq_len=self.past_seq_len)
        self.model_1.fit_eval(x_train_1, y_train_1, **self.config)

        predict_1 = self.model_1.predict(x_test_1)
        assert predict_1.shape == (x_test_1.shape[0], self.future_seq_len_1)

    def test_predict_2(self):
        x_train_2, y_train_2 = self.feat._roll_train(
            self.train_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_2)
        x_test_2 = self.feat._roll_test(self.test_data,
                                        past_seq_len=self.past_seq_len)
        self.model_2.fit_eval(x_train_2, y_train_2, **self.config)

        predict_2 = self.model_2.predict(x_test_2)
        assert predict_2.shape == (x_test_2.shape[0], self.future_seq_len_2)

    def test_save_restore_1(self):
        x_train_1, y_train_1 = self.feat._roll_train(
            self.train_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_1)
        x_test_1 = self.feat._roll_test(self.test_data,
                                        past_seq_len=self.past_seq_len)
        self.model_1.fit_eval(x_train_1, y_train_1, **self.config)

        predict_1_before = self.model_1.predict(x_test_1)
        new_model_1 = LSTMSeq2Seq(check_optional_config=False)

        dirname = tempfile.mkdtemp(prefix="automl_test_feature")
        try:
            save(dirname, model=self.model_1)
            restore(dirname, model=new_model_1, config=self.config)
            predict_1_after = new_model_1.predict(x_test_1)
            assert_array_almost_equal(predict_1_before, predict_1_after, decimal=2), \
                "Prediction values are not the same after restore: " \
                "predict before is {}, and predict after is {}".format(predict_1_before,
                                                                       predict_1_after)
            new_config = {'epochs': 1}
            new_model_1.fit_eval(x_train_1, y_train_1, **new_config)
        finally:
            shutil.rmtree(dirname)

    def test_save_restore_2(self):
        x_train_2, y_train_2 = self.feat._roll_train(
            self.train_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_2)
        x_test_2 = self.feat._roll_test(self.test_data,
                                        past_seq_len=self.past_seq_len)
        self.model_2.fit_eval(x_train_2, y_train_2, **self.config)

        predict_2_before = self.model_2.predict(x_test_2)
        new_model_2 = LSTMSeq2Seq(check_optional_config=False)

        dirname = tempfile.mkdtemp(prefix="automl_test_feature")
        try:
            save(dirname, model=self.model_2)
            restore(dirname, model=new_model_2, config=self.config)
            predict_2_after = new_model_2.predict(x_test_2)
            assert_array_almost_equal(predict_2_before, predict_2_after, decimal=2), \
                "Prediction values are not the same after restore: " \
                "predict before is {}, and predict after is {}".format(predict_2_before,
                                                                       predict_2_after)
            new_config = {'epochs': 2}
            new_model_2.fit_eval(x_train_2, y_train_2, **new_config)
        finally:
            shutil.rmtree(dirname)

    def test_predict_with_uncertainty(self, ):
        x_train_2, y_train_2 = self.feat._roll_train(
            self.train_data,
            past_seq_len=self.past_seq_len,
            future_seq_len=self.future_seq_len_2)
        x_test_2 = self.feat._roll_test(self.test_data,
                                        past_seq_len=self.past_seq_len)
        self.model_2.fit_eval(x_train_2, y_train_2, mc=True, **self.config)
        prediction, uncertainty = self.model_2.predict_with_uncertainty(
            x_test_2, n_iter=2)
        assert prediction.shape == (x_test_2.shape[0], self.future_seq_len_2)
        assert uncertainty.shape == (x_test_2.shape[0], self.future_seq_len_2)
        assert np.any(uncertainty)

        new_model_2 = LSTMSeq2Seq(check_optional_config=False)
        dirname = tempfile.mkdtemp(prefix="automl_test_feature")
        try:
            save(dirname, model=self.model_2)
            restore(dirname, model=new_model_2, config=self.config)
            prediction, uncertainty = new_model_2.predict_with_uncertainty(
                x_test_2, n_iter=2)
            assert prediction.shape == (x_test_2.shape[0],
                                        self.future_seq_len_2)
            assert uncertainty.shape == (x_test_2.shape[0],
                                         self.future_seq_len_2)
            assert np.any(uncertainty)
        finally:
            shutil.rmtree(dirname)
예제 #18
0
class TestMTNetKeras(ZooTestCase):

    def setup_method(self, method):
        tf.keras.backend.clear_session()
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()
        self.model = MTNetKeras()
        self.config = {"long_num": self.long_num,
                       "time_step": self.time_step,
                       "ar_window": np.random.randint(1, 3),
                       "cnn_height": np.random.randint(1, 3),
                       "epochs": 1}

    def teardown_method(self, method):
        pass

    def create_data(self):
        def gen_train_sample(data, past_seq_len, future_seq_len):
            data = pd.DataFrame(data)
            x, y = self.ft._roll_train(data,
                                       past_seq_len=past_seq_len,
                                       future_seq_len=future_seq_len
                                       )
            return x, y

        def gen_test_sample(data, past_seq_len):
            test_data = pd.DataFrame(data)
            x = self.ft._roll_test(test_data, past_seq_len=past_seq_len)
            return x

        self.long_num = 6
        self.time_step = 2
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1
        self.x_train, self.y_train = gen_train_sample(data=np.random.randn(
            64, 4), past_seq_len=look_back, future_seq_len=look_forward)
        self.x_val, self.y_val = gen_train_sample(data=np.random.randn(16, 4),
                                                  past_seq_len=look_back,
                                                  future_seq_len=look_forward)
        self.x_test = gen_test_sample(data=np.random.randn(16, 4),
                                      past_seq_len=look_back)

    def test_fit_evaluate(self):
        self.model.fit_eval(self.x_train, self.y_train,
                            validation_data=(self.x_val, self.y_val),
                            **self.config)
        self.model.evaluate(self.x_val, self.y_val)

    def test_save_restore(self):
        self.model.fit_eval(self.x_train, self.y_train,
                            validation_data=(self.x_val, self.y_val),
                            **self.config)
        y_pred = self.model.predict(self.x_test)
        assert y_pred.shape == (self.x_test.shape[0], self.y_train.shape[1])
        dirname = "tmp"
        restored_model = MTNetKeras()
        try:
            save(dirname, model=self.model)
            restore(dirname, model=restored_model, config=self.config)
            predict_after = restored_model.predict(self.x_test)
            assert_array_almost_equal(y_pred, predict_after, decimal=2), \
                "Prediction values are not the same after restore: " \
                "predict before is {}, and predict after is {}".format(y_pred, predict_after)
            restored_model.fit_eval(self.x_train, self.y_train, epochs=1)
            restored_model.evaluate(self.x_val, self.y_val)
        finally:
            shutil.rmtree("tmp")

    def test_predict_with_uncertainty(self):
        self.model.fit_eval(self.x_train, self.y_train,
                            validation_data=(self.x_val, self.y_val),
                            mc=True,
                            **self.config)
        pred, uncertainty = self.model.predict_with_uncertainty(self.x_test, n_iter=2)
        assert pred.shape == (self.x_test.shape[0], self.y_train.shape[1])
        assert uncertainty.shape == pred.shape
        assert np.any(uncertainty)
예제 #19
0
 def setUp(self):
     tf.keras.backend.clear_session()
     self.ft = TimeSequenceFeatureTransformer()
     self.create_data()
예제 #20
0
class TestZouwuModelForecast(ZooTestCase):
    def setup_method(self, method):
        tf.keras.backend.clear_session()
        # super(TestZouwuModelForecast, self).setup_method(method)
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()

    def teardown_method(self, method):
        pass

    def create_data(self):
        def gen_train_sample(data, past_seq_len, future_seq_len):
            data = pd.DataFrame(data)
            x, y = self.ft._roll_train(data,
                                       past_seq_len=past_seq_len,
                                       future_seq_len=future_seq_len)
            return x, y

        def gen_test_sample(data, past_seq_len):
            test_data = pd.DataFrame(data)
            x = self.ft._roll_test(test_data, past_seq_len=past_seq_len)
            return x

        self.long_num = 6
        self.time_step = 2
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1
        self.x_train, self.y_train = gen_train_sample(
            data=np.random.randn(64, 4),
            past_seq_len=look_back,
            future_seq_len=look_forward)
        self.x_val, self.y_val = gen_train_sample(data=np.random.randn(16, 4),
                                                  past_seq_len=look_back,
                                                  future_seq_len=look_forward)
        self.x_test = gen_test_sample(data=np.random.randn(16, 4),
                                      past_seq_len=look_back)

    def test_forecast_lstm(self):
        # TODO hacking to fix a bug
        model = LSTMForecaster(target_dim=1,
                               feature_dim=self.x_train.shape[-1])
        model.fit(self.x_train,
                  self.y_train,
                  validation_data=(self.x_val, self.y_val),
                  batch_size=8,
                  distributed=False)
        model.evaluate(self.x_val, self.y_val)
        model.predict(self.x_test)

    def test_forecast_mtnet(self):
        # TODO hacking to fix a bug
        model = MTNetForecaster(target_dim=1,
                                feature_dim=self.x_train.shape[-1],
                                lb_long_steps=self.long_num,
                                lb_long_stepsize=self.time_step)
        x_train_long, x_train_short = model.preprocess_input(self.x_train)
        x_val_long, x_val_short = model.preprocess_input(self.x_val)
        x_test_long, x_test_short = model.preprocess_input(self.x_test)

        model.fit([x_train_long, x_train_short],
                  self.y_train,
                  validation_data=([x_val_long, x_val_short], self.y_val),
                  batch_size=32,
                  distributed=False)
        model.evaluate([x_val_long, x_val_short], self.y_val)
        model.predict([x_test_long, x_test_short])

    def test_forecast_tcmf(self):
        from zoo.zouwu.model.forecast import TCMFForecaster
        model = TCMFForecaster(max_y_iterations=1,
                               init_XF_epoch=1,
                               max_FX_epoch=1,
                               max_TCN_epoch=1,
                               alt_iters=2)
        x = np.random.rand(300, 480)
        model.fit(x)
        model.predict(x=None, horizon=24)
        target_value = np.random.rand(300, 24)
        model.evaluate(x=None, target_value=target_value, metric=['mse'])
예제 #21
0
class TimeSequencePipeline(Pipeline):
    def __init__(self,
                 feature_transformers=None,
                 model=None,
                 config=None,
                 name=None):
        """
        initialize a pipeline
        :param model: the internal model
        :param feature_transformers: the feature transformers
        """
        self.feature_transformers = feature_transformers
        self.model = model
        self.config = config
        self.name = name
        self.time = time.strftime("%Y%m%d-%H%M%S")

    def describe(self):
        init_info = [
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        ]
        print("**** Initialization info ****")
        for info in init_info:
            print(info + ":", self.config[info])
        print("")

    def fit(self, input_df, validation_df=None, mc=False, epoch_num=20):
        x, y = self.feature_transformers.transform(input_df, is_train=True)
        if validation_df is not None and not validation_df.empty:
            validation_data = self.feature_transformers.transform(
                validation_df)
        else:
            validation_data = None
        new_config = {'epochs': epoch_num}
        self.model.fit_eval(x,
                            y,
                            validation_data,
                            mc=mc,
                            verbose=1,
                            **new_config)
        print('Fit done!')

    def _is_val_df_valid(self, validation_df):
        df_not_empty = isinstance(validation_df,
                                  pd.DataFrame) and not validation_df.empty
        df_list_not_empty = isinstance(validation_df, list) \
            and validation_df and not all([d.empty for d in validation_df])
        if validation_df is not None and (df_not_empty or df_list_not_empty):
            return True
        else:
            return False

    def _check_configs(self):
        required_configs = {'future_seq_len'}
        if not self.config.keys() & required_configs:
            raise ValueError("Missing required parameters in configuration. " +
                             "Required parameters are: " +
                             str(required_configs))
        default_config = {
            'dt_col': 'datetime',
            'target_col': 'value',
            'extra_features_col': None,
            'drop_missing': True,
            'past_seq_len': 2,
            'batch_size': 64,
            'lr': 0.001,
            'dropout': 0.2,
            'epochs': 10,
            'metric': 'mse'
        }
        for config, value in default_config.items():
            if config not in self.config:
                print('Config: \'{}\' is not specified. '
                      'A default value of {} will be used.'.format(
                          config, value))

    def get_default_configs(self):
        default_configs = {
            'dt_col': 'datetime',
            'target_col': 'value',
            'extra_features_col': None,
            'drop_missing': True,
            'future_seq_len': 1,
            'past_seq_len': 2,
            'batch_size': 64,
            'lr': 0.001,
            'dropout': 0.2,
            'epochs': 10,
            'metric': 'mean_squared_error'
        }
        print("**** default config: ****")
        for config in default_configs:
            print(config + ":", default_configs[config])
        print(
            "You can change any fields in the default configs by passing into "
            "fit_with_fixed_configs(). Otherwise, the default values will be used."
        )
        return default_configs

    def fit_with_fixed_configs(self,
                               input_df,
                               validation_df=None,
                               mc=False,
                               **user_configs):
        """
        Fit pipeline with fixed configs. The model will be trained from initialization
        with the hyper-parameter specified in configs. The configs contain both identity configs
        (Eg. "future_seq_len", "dt_col", "target_col", "metric") and automl tunable configs
        (Eg. "past_seq_len", "batch_size").
        We recommend calling get_default_configs to see the name and default values of configs you
        you can specify.
        :param input_df: one data frame or a list of data frames
        :param validation_df: one data frame or a list of data frames
        :param user_configs: you can overwrite or add more configs with user_configs. Eg. "epochs"
        :return:
        """
        # self._check_configs()
        if self.config is None:
            self.config = self.get_default_configs()
        if user_configs is not None:
            self.config.update(user_configs)
        ft_id_config_set = {
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        }
        ft_id_configs = {a: self.config[a] for a in ft_id_config_set}
        self.feature_transformers = TimeSequenceFeatureTransformer(
            **ft_id_configs)
        model_id_config_set = {'future_seq_len'}
        ft_id_configs = {a: self.config[a] for a in model_id_config_set}
        self.model = TimeSequenceModel(check_optional_config=False,
                                       **ft_id_configs)
        all_available_features = self.feature_transformers.get_feature_list(
            input_df)
        self.config.update({"selected_features": all_available_features})
        (x_train, y_train) = self.feature_transformers.fit_transform(
            input_df, **self.config)
        if self._is_val_df_valid(validation_df):
            validation_data = self.feature_transformers.transform(
                validation_df)
        else:
            validation_data = None

        self.model.fit_eval(x_train,
                            y_train,
                            validation_data=validation_data,
                            mc=mc,
                            verbose=1,
                            **self.config)

    def evaluate(self, input_df, metrics=["mse"], multioutput='raw_values'):
        """
        evaluate the pipeline
        :param input_df:
        :param metrics: subset of ['mean_squared_error', 'r_square', 'sMAPE']
        :param multioutput: string in ['raw_values', 'uniform_average']
                'raw_values' :
                    Returns a full set of errors in case of multioutput input.
                'uniform_average' :
                    Errors of all outputs are averaged with uniform weight.
        :return:
        """
        if isinstance(metrics, str):
            metrics = [metrics]
        # if not isinstance(metrics, list):
        #    raise ValueError("Expected metrics to be a list!")

        x, y = self.feature_transformers.transform(input_df, is_train=True)
        y_pred = self.model.predict(x)
        if y_pred.shape[1] == 1:
            multioutput = 'uniform_average'
        y_unscale, y_pred_unscale = self.feature_transformers.post_processing(
            input_df, y_pred, is_train=True)

        return [
            Evaluator.evaluate(m,
                               y_unscale,
                               y_pred_unscale,
                               multioutput=multioutput) for m in metrics
        ]

    def predict(self, input_df):
        """
        predict test data with the pipeline fitted
        :param input_df:
        :return:
        """
        x, _ = self.feature_transformers.transform(input_df, is_train=False)
        y_pred = self.model.predict(x)
        y_output = self.feature_transformers.post_processing(input_df,
                                                             y_pred,
                                                             is_train=False)
        return y_output

    def predict_with_uncertainty(self, input_df, n_iter=100):
        x, _ = self.feature_transformers.transform(input_df, is_train=False)
        y_pred, y_pred_uncertainty = self.model.predict_with_uncertainty(
            x=x, n_iter=n_iter)
        y_output = self.feature_transformers.post_processing(input_df,
                                                             y_pred,
                                                             is_train=False)
        y_uncertainty = self.feature_transformers.unscale_uncertainty(
            y_pred_uncertainty)
        return y_output, y_uncertainty

    def save(self, ppl_file=None):
        """
        save pipeline to file, contains feature transformer, model, trial config.
        :param ppl_file:
        :return:
        """
        ppl_file = ppl_file or os.path.join(
            DEFAULT_PPL_DIR, "{}_{}.ppl".format(self.name, self.time))
        save_zip(ppl_file, self.feature_transformers, self.model, self.config)
        print("Pipeline is saved in", ppl_file)
        return ppl_file

    def config_save(self, config_file=None):
        """
        save all configs to file.
        :param config_file:
        :return:
        """
        config_file = config_file or os.path.join(
            DEFAULT_CONFIG_DIR, "{}_{}.json".format(self.name, self.time))
        save_config(config_file, self.config, replace=True)
        return config_file
예제 #22
0
    def _hp_search(self, input_df, validation_df, metric):
        # features
        # feature_list = ["WEEKDAY(datetime)", "HOUR(datetime)",
        #                "PERCENTILE(value)", "IS_WEEKEND(datetime)",
        #                "IS_AWAKE(datetime)", "IS_BUSY_HOURS(datetime)"
        #                # "DAY(datetime)","MONTH(datetime)", #probabaly not useful
        #                ]
        # target_list = ["value"]
        # ft = TimeSequenceFeatures(self.future_seq_len, self.dt_col, self.target_col, self.extra_features_col)

        # ft = DummyTimeSequenceFeatures(file_path='../../../../data/nyc_taxi_rolled_split.npz')
        ft = TimeSequenceFeatureTransformer(self.future_seq_len, self.dt_col,
                                            self.target_col,
                                            self.extra_features_col,
                                            self.drop_missing)

        feature_list = ft.get_feature_list(input_df)
        # model
        model = VanillaLSTM(check_optional_config=False,
                            future_seq_len=self.future_seq_len)

        search_space = {
            # -------- feature related parameters
            "selected_features":
            RandomSample(lambda spec: np.random.choice(
                feature_list,
                size=np.random.randint(low=3, high=len(feature_list), size=1),
                replace=False)),

            # --------- model related parameters
            # 'input_shape_x': x_train.shape[1],
            # 'input_shape_y': x_train.shape[-1],
            'out_units':
            self.future_seq_len,
            "lr":
            0.001,
            "lstm_1_units":
            GridSearch([16, 32]),
            "dropout_1":
            0.2,
            "lstm_2_units":
            10,
            "dropout_2":
            RandomSample(lambda spec: np.random.uniform(0.2, 0.5)),
            "batch_size":
            1024,
        }

        stop = {"reward_metric": -0.05, "training_iteration": 10}

        searcher = RayTuneSearchEngine(logs_dir=self.logs_dir,
                                       ray_num_cpus=6,
                                       resources_per_trial={"cpu": 2})
        searcher.compile(
            input_df,
            search_space=search_space,
            stop=stop,
            # feature_transformers=TimeSequenceFeatures,
            feature_transformers=ft,  # use dummy features for testing the rest
            model=model,
            validation_df=validation_df,
            metric=metric)
        # searcher.test_run()

        trials = searcher.run()
        best = searcher.get_best_trials(
            k=1)[0]  # get the best one trial, later could be n
        pipeline = self._make_pipeline(
            best,
            feature_transformers=ft,
            # feature_transformers=TimeSequenceFeatures(
            #     file_path='../../../../data/nyc_taxi_rolled_split.npz'),
            model=VanillaLSTM(check_optional_config=False))
        return pipeline
예제 #23
0
class TestZouwuModelForecast(ZooTestCase):
    def setup_method(self, method):
        tf.keras.backend.clear_session()
        # super(TestZouwuModelForecast, self).setup_method(method)
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()

    def teardown_method(self, method):
        pass

    def create_data(self):
        def gen_train_sample(data, past_seq_len, future_seq_len):
            data = pd.DataFrame(data)
            x, y = self.ft._roll_train(data,
                                       past_seq_len=past_seq_len,
                                       future_seq_len=future_seq_len)
            return x, y

        def gen_test_sample(data, past_seq_len):
            test_data = pd.DataFrame(data)
            x = self.ft._roll_test(test_data, past_seq_len=past_seq_len)
            return x

        self.long_num = 6
        self.time_step = 2
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1
        self.x_train, self.y_train = gen_train_sample(
            data=np.random.randn(64, 4),
            past_seq_len=look_back,
            future_seq_len=look_forward)
        self.x_val, self.y_val = gen_train_sample(data=np.random.randn(16, 4),
                                                  past_seq_len=look_back,
                                                  future_seq_len=look_forward)
        self.x_test = gen_test_sample(data=np.random.randn(16, 4),
                                      past_seq_len=look_back)

    def test_forecast_lstm(self):
        # TODO hacking to fix a bug
        model = LSTMForecaster(horizon=1, feature_dim=self.x_train.shape[-1])
        model.fit(self.x_train,
                  self.y_train,
                  validation_data=(self.x_val, self.y_val),
                  batch_size=8,
                  distributed=False)
        model.evaluate(self.x_val, self.y_val)
        model.predict(self.x_test)

    def test_forecast_mtnet(self):
        # TODO hacking to fix a bug
        model = MTNetForecaster(horizon=1,
                                feature_dim=self.x_train.shape[-1],
                                lb_long_steps=self.long_num,
                                lb_long_stepsize=self.time_step)
        x_train_long, x_train_short = model.preprocess_input(self.x_train)
        x_val_long, x_val_short = model.preprocess_input(self.x_val)
        x_test_long, x_test_short = model.preprocess_input(self.x_test)

        model.fit([x_train_long, x_train_short],
                  self.y_train,
                  validation_data=([x_val_long, x_val_short], self.y_val),
                  batch_size=32,
                  distributed=False)
        model.evaluate([x_val_long, x_val_short], self.y_val)
        model.predict([x_test_long, x_test_short])
예제 #24
0
 def setup_method(self, method):
     tf.keras.backend.clear_session()
     # super(TestZouwuModelForecast, self).setup_method(method)
     self.ft = TimeSequenceFeatureTransformer()
     self.create_data()
예제 #25
0
class TestZouwuModelForecast(ZooTestCase):
    def setup_method(self, method):
        tf.keras.backend.clear_session()
        # super(TestZouwuModelForecast, self).setup_method(method)
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()

    def teardown_method(self, method):
        pass

    def create_data(self):
        def gen_train_sample(data, past_seq_len, future_seq_len):
            data = pd.DataFrame(data)
            x, y = self.ft._roll_train(data,
                                       past_seq_len=past_seq_len,
                                       future_seq_len=future_seq_len)
            return x, y

        def gen_test_sample(data, past_seq_len):
            test_data = pd.DataFrame(data)
            x = self.ft._roll_test(test_data, past_seq_len=past_seq_len)
            return x

        self.long_num = 6
        self.time_step = 2
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1
        self.x_train, self.y_train = gen_train_sample(
            data=np.random.randn(64, 4),
            past_seq_len=look_back,
            future_seq_len=look_forward)
        self.x_val, self.y_val = gen_train_sample(data=np.random.randn(16, 4),
                                                  past_seq_len=look_back,
                                                  future_seq_len=look_forward)
        self.x_test = gen_test_sample(data=np.random.randn(16, 4),
                                      past_seq_len=look_back)

    def test_forecast_lstm(self):
        # TODO hacking to fix a bug
        model = LSTMForecaster(target_dim=1,
                               feature_dim=self.x_train.shape[-1])
        model.fit(self.x_train,
                  self.y_train,
                  validation_data=(self.x_val, self.y_val),
                  batch_size=8,
                  distributed=False)
        model.evaluate(self.x_val, self.y_val)
        model.predict(self.x_test)

    def test_forecast_mtnet(self):
        # TODO hacking to fix a bug
        model = MTNetForecaster(target_dim=1,
                                feature_dim=self.x_train.shape[-1],
                                long_series_num=self.long_num,
                                series_length=self.time_step)
        x_train_long, x_train_short = model.preprocess_input(self.x_train)
        x_val_long, x_val_short = model.preprocess_input(self.x_val)
        x_test_long, x_test_short = model.preprocess_input(self.x_test)

        model.fit([x_train_long, x_train_short],
                  self.y_train,
                  validation_data=([x_val_long, x_val_short], self.y_val),
                  batch_size=32,
                  distributed=False)
        model.evaluate([x_val_long, x_val_short], self.y_val)
        model.predict([x_test_long, x_test_short])

    def test_forecast_tcmf(self):
        from zoo.zouwu.model.forecast import TCMFForecaster
        import tempfile
        model = TCMFForecaster(max_y_iterations=1,
                               init_FX_epoch=1,
                               max_FX_epoch=1,
                               max_TCN_epoch=1,
                               alt_iters=2)
        horizon = np.random.randint(1, 50)
        # construct data
        id = np.arange(300)
        data = np.random.rand(300, 480)
        input = dict({'data': data})
        with self.assertRaises(Exception) as context:
            model.fit(input)
        self.assertTrue("key `y` doesn't exist in x" in str(context.exception))
        input = dict({'id': id, 'y': data})
        with self.assertRaises(Exception) as context:
            model.is_distributed()
        self.assertTrue('You should run fit before calling is_distributed()' in
                        str(context.exception))
        model.fit(input)
        assert not model.is_distributed()
        with self.assertRaises(Exception) as context:
            model.fit(input)
        self.assertTrue('This model has already been fully trained' in str(
            context.exception))
        with self.assertRaises(Exception) as context:
            model.fit(input, incremental=True)
        self.assertTrue(
            'NotImplementedError' in context.exception.__class__.__name__)
        with tempfile.TemporaryDirectory() as tempdirname:
            model.save(tempdirname)
            loaded_model = TCMFForecaster.load(tempdirname, distributed=False)
        yhat = model.predict(x=None, horizon=horizon)
        yhat_loaded = loaded_model.predict(x=None, horizon=horizon)
        yhat_id = yhat_loaded["id"]
        assert (yhat_id == id).all()
        yhat = yhat["prediction"]
        yhat_loaded = yhat_loaded["prediction"]
        assert yhat.shape == (300, horizon)
        assert (yhat == yhat_loaded).all()
        target_value = np.random.rand(300, horizon)
        target_value = dict({"y": target_value})
        model.evaluate(x=None, target_value=target_value, metric=['mse'])

    def test_forecast_tcmf_without_id(self):
        from zoo.zouwu.model.forecast import TCMFForecaster
        import tempfile
        model = TCMFForecaster(max_y_iterations=1,
                               init_FX_epoch=1,
                               max_FX_epoch=1,
                               max_TCN_epoch=1,
                               alt_iters=2)
        horizon = np.random.randint(1, 50)
        # construct data
        id = np.arange(200)
        data = np.random.rand(300, 480)
        input = dict({'y': "abc"})
        with self.assertRaises(Exception) as context:
            model.fit(input)
        self.assertTrue(
            "the value of y should be an ndarray" in str(context.exception))
        input = dict({'id': id, 'y': data})
        with self.assertRaises(Exception) as context:
            model.fit(input)
        self.assertTrue(
            "the length of the id array should be equal to the number of" in
            str(context.exception))
        input = dict({'y': data})
        model.fit(input)
        assert not model.is_distributed()
        with self.assertRaises(Exception) as context:
            model.fit(input)
        self.assertTrue('This model has already been fully trained' in str(
            context.exception))
        with tempfile.TemporaryDirectory() as tempdirname:
            model.save(tempdirname)
            loaded_model = TCMFForecaster.load(tempdirname, distributed=False)
        yhat = model.predict(x=None, horizon=horizon)
        yhat_loaded = loaded_model.predict(x=None, horizon=horizon)
        assert "id" not in yhat_loaded
        yhat = yhat["prediction"]
        yhat_loaded = yhat_loaded["prediction"]
        assert yhat.shape == (300, horizon)
        assert (yhat == yhat_loaded).all()
        target_value = np.random.rand(300, horizon)
        target_value_fake = dict({"data": target_value})
        with self.assertRaises(Exception) as context:
            model.evaluate(x=None,
                           target_value=target_value_fake,
                           metric=['mse'])
        self.assertTrue("key y doesn't exist in y" in str(context.exception))
        target_value = dict({"y": target_value})
        model.evaluate(x=None, target_value=target_value, metric=['mse'])

    def test_forecast_tcmf_xshards(self):
        from zoo.zouwu.model.forecast import TCMFForecaster
        from zoo.orca import OrcaContext
        import zoo.orca.data.pandas
        import tempfile
        OrcaContext.pandas_read_backend = "pandas"

        def preprocessing(df, id_name, y_name):
            id = df.index
            data = df.to_numpy()
            result = dict({id_name: id, y_name: data})
            return result

        def postprocessing(pred_results, output_dt_col_name):
            id_arr = pred_results["id"]
            pred_results = pred_results["prediction"]
            pred_results = np.concatenate(
                (np.expand_dims(id_arr, axis=1), pred_results), axis=1)
            final_df = pd.DataFrame(pred_results,
                                    columns=["id"] + output_dt_col_name)
            final_df.id = final_df.id.astype("int")
            final_df = final_df.set_index("id")
            final_df.columns.name = "datetime"
            final_df = final_df.unstack().reset_index().rename(
                {0: "prediction"}, axis=1)
            return final_df

        def get_pred(d):
            return d["prediction"]

        model = TCMFForecaster(max_y_iterations=1,
                               init_FX_epoch=1,
                               max_FX_epoch=1,
                               max_TCN_epoch=1,
                               alt_iters=2)

        with tempfile.NamedTemporaryFile() as temp:
            data = np.random.rand(300, 480)
            df = pd.DataFrame(data)
            df.to_csv(temp.name)
            shard = zoo.orca.data.pandas.read_csv(temp.name)
        shard.cache()
        shard_train = shard.transform_shard(preprocessing, 'id', 'data')
        with self.assertRaises(Exception) as context:
            model.fit(shard_train)
        self.assertTrue("key `y` doesn't exist in x" in str(context.exception))
        shard_train = shard.transform_shard(preprocessing, 'cid', 'y')
        with self.assertRaises(Exception) as context:
            model.fit(shard_train)
        self.assertTrue(
            "key `id` doesn't exist in x" in str(context.exception))
        with self.assertRaises(Exception) as context:
            model.is_distributed()
        self.assertTrue('You should run fit before calling is_distributed()' in
                        str(context.exception))
        shard_train = shard.transform_shard(preprocessing, 'id', 'y')
        model.fit(shard_train)
        assert model.is_distributed()
        with self.assertRaises(Exception) as context:
            model.fit(shard_train)
        self.assertTrue('This model has already been fully trained' in str(
            context.exception))
        with self.assertRaises(Exception) as context:
            model.fit(shard_train, incremental=True)
        self.assertTrue(
            'NotImplementedError' in context.exception.__class__.__name__)
        with tempfile.TemporaryDirectory() as tempdirname:
            model.save(tempdirname + "/model")
            loaded_model = TCMFForecaster.load(tempdirname + "/model",
                                               distributed=True)
        horizon = np.random.randint(1, 50)
        yhat_shard_origin = model.predict(x=None, horizon=horizon)
        yhat_list_origin = yhat_shard_origin.collect()
        yhat_list_origin = list(map(get_pred, yhat_list_origin))
        yhat_shard = loaded_model.predict(x=None, horizon=horizon)
        yhat_list = yhat_shard.collect()
        yhat_list = list(map(get_pred, yhat_list))
        yhat_origin = np.concatenate(yhat_list_origin)
        yhat = np.concatenate(yhat_list)
        assert yhat.shape == (300, horizon)
        assert (yhat == yhat_origin).all()
        output_dt_col_name = pd.date_range(start='2020-05-01',
                                           periods=horizon,
                                           freq='H').to_list()
        yhat_df_shards = yhat_shard.transform_shard(postprocessing,
                                                    output_dt_col_name)
        final_df_list = yhat_df_shards.collect()
        final_df = pd.concat(final_df_list)
        final_df.sort_values("datetime", inplace=True)
        assert final_df.shape == (300 * horizon, 3)
        OrcaContext.pandas_read_backend = "spark"
예제 #26
0
 def setup_method(self, method):
     self.ft = TimeSequenceFeatureTransformer()
     self.create_data()
예제 #27
0
class TimeSequencePipeline(Pipeline):
    def __init__(self, feature_transformers=None, model=None, config=None):
        """
        initialize a pipeline
        :param model: the internal model
        :param feature_transformers: the feature transformers
        """
        if feature_transformers is None:
            assert model is None and config is None
            self.feature_transformers = TimeSequenceFeatureTransformer()
            self.model = VanillaLSTM(check_optional_config=False)
            print("Initialize new time sequence pipeline.")
        else:
            self.feature_transformers = feature_transformers
            self.model = model
            self.config = config

    def evaluate(self, input_df, metric=["mean_squared_error"]):
        """
        evaluate the pipeline
        :param input_df:
        :param metric:
        :return:
        """
        x, y = self.feature_transformers.transform(input_df, is_train=True)
        return self.model.evaluate(x, y, metric)

    def predict(self, input_df):
        # there might be no y in the data, TODO needs to fix in TimeSquenceFeatures
        x = self.feature_transformers.transform(input_df, is_train=False)
        y_pred = self.model.predict(x)
        y_output = self.feature_transformers.post_processing(y_pred)
        return y_output

    def save(self, file):
        """
        save pipeline to file, contains feature transformer, model, trial config.
        :param file:
        :return:
        """
        if not os.path.isdir(file):
            os.mkdir(file)
        model_path = os.path.join(file, "weights_tune.h5")
        config_path = os.path.join(file, "all_config.json")
        self.feature_transformers.save(config_path, replace=True)
        self.model.save(model_path, config_path)
        # check if ** is needed
        save_config(config_path, self.config)

    def restore(self, file):
        """
        restore pipeline from file
        :param file:
        :param config:
        :return:
        """
        model_path = os.path.join(file, "weights_tune.h5")
        config_path = os.path.join(file, "all_config.json")
        all_config = load_config(config_path)
        self.model.restore(model_path, **all_config)
        self.feature_transformers.restore(**all_config)