예제 #1
0
    def _hp_search(self,
                   input_df,
                   validation_df,
                   metric,
                   recipe,
                   mc,
                   resources_per_trial,
                   remote_dir):

        ft = TimeSequenceFeatureTransformer(self.future_seq_len,
                                            self.dt_col,
                                            self.target_col,
                                            self.extra_features_col,
                                            self.drop_missing)
        if isinstance(input_df, list):
            feature_list = ft.get_feature_list(input_df[0])
        else:
            feature_list = ft.get_feature_list(input_df)

        # model = VanillaLSTM(check_optional_config=False)
        model = TimeSequenceModel(check_optional_config=False, future_seq_len=self.future_seq_len)

        # prepare parameters for search engine
        search_space = recipe.search_space(feature_list)
        runtime_params = recipe.runtime_params()
        num_samples = runtime_params['num_samples']
        stop = dict(runtime_params)
        search_algorithm_params = recipe.search_algorithm_params()
        search_algorithm = recipe.search_algorithm()
        fixed_params = recipe.fixed_params()
        del stop['num_samples']

        searcher = RayTuneSearchEngine(logs_dir=self.logs_dir,
                                       resources_per_trial=resources_per_trial,
                                       name=self.name,
                                       remote_dir=remote_dir,
                                       )
        searcher.compile(input_df,
                         search_space=search_space,
                         stop=stop,
                         search_algorithm_params=search_algorithm_params,
                         search_algorithm=search_algorithm,
                         fixed_params=fixed_params,
                         # feature_transformers=TimeSequenceFeatures,
                         feature_transformers=ft,
                         # model=model,
                         future_seq_len=self.future_seq_len,
                         validation_df=validation_df,
                         metric=metric,
                         mc=mc,
                         num_samples=num_samples)
        # searcher.test_run()
        searcher.run()

        best = searcher.get_best_trials(k=1)[0]  # get the best one trial, later could be n
        pipeline = self._make_pipeline(best,
                                       feature_transformers=ft,
                                       model=model,
                                       remote_dir=remote_dir)
        return pipeline
    def _hp_search(self, input_df, validation_df, metric, recipe, mc,
                   resources_per_trial, remote_dir):
        ft = TimeSequenceFeatureTransformer(self.future_seq_len, self.dt_col,
                                            self.target_col,
                                            self.extra_features_col,
                                            self.drop_missing)
        if isinstance(input_df, list):
            feature_list = ft.get_feature_list(input_df[0])
        else:
            feature_list = ft.get_feature_list(input_df)

        def model_create_func():
            # model = VanillaLSTM(check_optional_config=False)
            _model = TimeSequenceModel(check_optional_config=False,
                                       future_seq_len=self.future_seq_len)
            return _model

        model = model_create_func()

        # prepare parameters for search engine
        search_space = recipe.search_space(feature_list)

        metric_mode = TimeSequencePredictor._get_metric_mode(metric)
        searcher = RayTuneSearchEngine(
            logs_dir=self.logs_dir,
            resources_per_trial=resources_per_trial,
            name=self.name,
            remote_dir=remote_dir,
        )
        searcher.compile(
            input_df,
            model_create_func=model_create_func(),
            search_space=search_space,
            recipe=recipe,
            feature_transformers=ft,
            future_seq_len=self.future_seq_len,
            validation_df=validation_df,
            metric=metric,
            metric_mode=metric_mode,
            mc=mc,
        )
        # searcher.test_run()
        analysis = searcher.run()

        pipeline = self._make_pipeline(analysis,
                                       metric_mode,
                                       feature_transformers=ft,
                                       model=model,
                                       remote_dir=remote_dir)
        return pipeline
예제 #3
0
class TimeSequencePipeline(Pipeline):
    def __init__(self,
                 feature_transformers=None,
                 model=None,
                 config=None,
                 name=None):
        """
        initialize a pipeline
        :param model: the internal model
        :param feature_transformers: the feature transformers
        """
        self.feature_transformers = feature_transformers
        self.model = model
        self.config = config
        self.name = name
        self.time = time.strftime("%Y%m%d-%H%M%S")

    def describe(self):
        init_info = [
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        ]
        print("**** Initialization info ****")
        for info in init_info:
            print(info + ":", self.config[info])
        print("")

    def fit(self, input_df, validation_df=None, mc=False, epoch_num=20):
        x, y = self.feature_transformers.transform(input_df, is_train=True)
        if validation_df is not None and not validation_df.empty:
            validation_data = self.feature_transformers.transform(
                validation_df)
        else:
            validation_data = None
        new_config = {'epochs': epoch_num}
        self.model.fit_eval(x,
                            y,
                            validation_data,
                            mc=mc,
                            verbose=1,
                            **new_config)
        print('Fit done!')

    def _is_val_df_valid(self, validation_df):
        df_not_empty = isinstance(validation_df,
                                  pd.DataFrame) and not validation_df.empty
        df_list_not_empty = isinstance(validation_df, list) \
            and validation_df and not all([d.empty for d in validation_df])
        if validation_df is not None and (df_not_empty or df_list_not_empty):
            return True
        else:
            return False

    def _check_configs(self):
        required_configs = {'future_seq_len'}
        if not self.config.keys() & required_configs:
            raise ValueError("Missing required parameters in configuration. " +
                             "Required parameters are: " +
                             str(required_configs))
        default_config = {
            'dt_col': 'datetime',
            'target_col': 'value',
            'extra_features_col': None,
            'drop_missing': True,
            'past_seq_len': 2,
            'batch_size': 64,
            'lr': 0.001,
            'dropout': 0.2,
            'epochs': 10,
            'metric': 'mse'
        }
        for config, value in default_config.items():
            if config not in self.config:
                print('Config: \'{}\' is not specified. '
                      'A default value of {} will be used.'.format(
                          config, value))

    def get_default_configs(self):
        default_configs = {
            'dt_col': 'datetime',
            'target_col': 'value',
            'extra_features_col': None,
            'drop_missing': True,
            'future_seq_len': 1,
            'past_seq_len': 2,
            'batch_size': 64,
            'lr': 0.001,
            'dropout': 0.2,
            'epochs': 10,
            'metric': 'mean_squared_error'
        }
        print("**** default config: ****")
        for config in default_configs:
            print(config + ":", default_configs[config])
        print(
            "You can change any fields in the default configs by passing into "
            "fit_with_fixed_configs(). Otherwise, the default values will be used."
        )
        return default_configs

    def fit_with_fixed_configs(self,
                               input_df,
                               validation_df=None,
                               mc=False,
                               **user_configs):
        """
        Fit pipeline with fixed configs. The model will be trained from initialization
        with the hyper-parameter specified in configs. The configs contain both identity configs
        (Eg. "future_seq_len", "dt_col", "target_col", "metric") and automl tunable configs
        (Eg. "past_seq_len", "batch_size").
        We recommend calling get_default_configs to see the name and default values of configs you
        you can specify.
        :param input_df: one data frame or a list of data frames
        :param validation_df: one data frame or a list of data frames
        :param user_configs: you can overwrite or add more configs with user_configs. Eg. "epochs"
        :return:
        """
        # self._check_configs()
        if self.config is None:
            self.config = self.get_default_configs()
        if user_configs is not None:
            self.config.update(user_configs)
        ft_id_config_set = {
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        }
        ft_id_configs = {a: self.config[a] for a in ft_id_config_set}
        self.feature_transformers = TimeSequenceFeatureTransformer(
            **ft_id_configs)
        model_id_config_set = {'future_seq_len'}
        ft_id_configs = {a: self.config[a] for a in model_id_config_set}
        self.model = TimeSequenceModel(check_optional_config=False,
                                       **ft_id_configs)
        all_available_features = self.feature_transformers.get_feature_list(
            input_df)
        self.config.update({"selected_features": all_available_features})
        (x_train, y_train) = self.feature_transformers.fit_transform(
            input_df, **self.config)
        if self._is_val_df_valid(validation_df):
            validation_data = self.feature_transformers.transform(
                validation_df)
        else:
            validation_data = None

        self.model.fit_eval(x_train,
                            y_train,
                            validation_data=validation_data,
                            mc=mc,
                            verbose=1,
                            **self.config)

    def evaluate(self, input_df, metrics=["mse"], multioutput='raw_values'):
        """
        evaluate the pipeline
        :param input_df:
        :param metrics: subset of ['mean_squared_error', 'r_square', 'sMAPE']
        :param multioutput: string in ['raw_values', 'uniform_average']
                'raw_values' :
                    Returns a full set of errors in case of multioutput input.
                'uniform_average' :
                    Errors of all outputs are averaged with uniform weight.
        :return:
        """
        if isinstance(metrics, str):
            metrics = [metrics]
        # if not isinstance(metrics, list):
        #    raise ValueError("Expected metrics to be a list!")

        x, y = self.feature_transformers.transform(input_df, is_train=True)
        y_pred = self.model.predict(x)
        if y_pred.shape[1] == 1:
            multioutput = 'uniform_average'
        y_unscale, y_pred_unscale = self.feature_transformers.post_processing(
            input_df, y_pred, is_train=True)

        return [
            Evaluator.evaluate(m,
                               y_unscale,
                               y_pred_unscale,
                               multioutput=multioutput) for m in metrics
        ]

    def predict(self, input_df):
        """
        predict test data with the pipeline fitted
        :param input_df:
        :return:
        """
        x, _ = self.feature_transformers.transform(input_df, is_train=False)
        y_pred = self.model.predict(x)
        y_output = self.feature_transformers.post_processing(input_df,
                                                             y_pred,
                                                             is_train=False)
        return y_output

    def predict_with_uncertainty(self, input_df, n_iter=100):
        x, _ = self.feature_transformers.transform(input_df, is_train=False)
        y_pred, y_pred_uncertainty = self.model.predict_with_uncertainty(
            x=x, n_iter=n_iter)
        y_output = self.feature_transformers.post_processing(input_df,
                                                             y_pred,
                                                             is_train=False)
        y_uncertainty = self.feature_transformers.unscale_uncertainty(
            y_pred_uncertainty)
        return y_output, y_uncertainty

    def save(self, ppl_file=None):
        """
        save pipeline to file, contains feature transformer, model, trial config.
        :param ppl_file:
        :return:
        """
        ppl_file = ppl_file or os.path.join(
            DEFAULT_PPL_DIR, "{}_{}.ppl".format(self.name, self.time))
        save_zip(ppl_file, self.feature_transformers, self.model, self.config)
        print("Pipeline is saved in", ppl_file)
        return ppl_file

    def config_save(self, config_file=None):
        """
        save all configs to file.
        :param config_file:
        :return:
        """
        config_file = config_file or os.path.join(
            DEFAULT_CONFIG_DIR, "{}_{}.json".format(self.name, self.time))
        save_config(config_file, self.config, replace=True)
        return config_file
예제 #4
0
    def _hp_search(self, input_df, validation_df, metric):
        # features
        # feature_list = ["WEEKDAY(datetime)", "HOUR(datetime)",
        #                "PERCENTILE(value)", "IS_WEEKEND(datetime)",
        #                "IS_AWAKE(datetime)", "IS_BUSY_HOURS(datetime)"
        #                # "DAY(datetime)","MONTH(datetime)", #probabaly not useful
        #                ]
        # target_list = ["value"]
        # ft = TimeSequenceFeatures(self.future_seq_len, self.dt_col, self.target_col, self.extra_features_col)

        # ft = DummyTimeSequenceFeatures(file_path='../../../../data/nyc_taxi_rolled_split.npz')
        ft = TimeSequenceFeatureTransformer(self.future_seq_len, self.dt_col,
                                            self.target_col,
                                            self.extra_features_col,
                                            self.drop_missing)

        feature_list = ft.get_feature_list(input_df)
        # model
        model = VanillaLSTM(check_optional_config=False,
                            future_seq_len=self.future_seq_len)

        search_space = {
            # -------- feature related parameters
            "selected_features":
            RandomSample(lambda spec: np.random.choice(
                feature_list,
                size=np.random.randint(low=3, high=len(feature_list), size=1),
                replace=False)),

            # --------- model related parameters
            # 'input_shape_x': x_train.shape[1],
            # 'input_shape_y': x_train.shape[-1],
            'out_units':
            self.future_seq_len,
            "lr":
            0.001,
            "lstm_1_units":
            GridSearch([16, 32]),
            "dropout_1":
            0.2,
            "lstm_2_units":
            10,
            "dropout_2":
            RandomSample(lambda spec: np.random.uniform(0.2, 0.5)),
            "batch_size":
            1024,
        }

        stop = {"reward_metric": -0.05, "training_iteration": 10}

        searcher = RayTuneSearchEngine(logs_dir=self.logs_dir,
                                       ray_num_cpus=6,
                                       resources_per_trial={"cpu": 2})
        searcher.compile(
            input_df,
            search_space=search_space,
            stop=stop,
            # feature_transformers=TimeSequenceFeatures,
            feature_transformers=ft,  # use dummy features for testing the rest
            model=model,
            validation_df=validation_df,
            metric=metric)
        # searcher.test_run()

        trials = searcher.run()
        best = searcher.get_best_trials(
            k=1)[0]  # get the best one trial, later could be n
        pipeline = self._make_pipeline(
            best,
            feature_transformers=ft,
            # feature_transformers=TimeSequenceFeatures(
            #     file_path='../../../../data/nyc_taxi_rolled_split.npz'),
            model=VanillaLSTM(check_optional_config=False))
        return pipeline