Example #1
        def train_func(config, tune_reporter):
            # make a copy from global variables for trial to make changes
            global_ft = ray.get(ft_id)
            # global_model = ray.get(model_id)
            trial_ft = deepcopy(global_ft)
            # trial_model = deepcopy(global_model)
            trial_model = TimeSequenceModel(check_optional_config=False,
                                            future_seq_len=future_seq_len)

            # handling input
            global_input_df = ray.get(input_df_id)
            trial_input_df = deepcopy(global_input_df)
            config = convert_bayes_configs(config).copy()
            # print("config is ", config)
            (x_train, y_train) = trial_ft.fit_transform(trial_input_df, **config)
            # trial_ft.fit(trial_input_df, **config)

            # handling validation data
            validation_data = None
            if is_val_df_valid:
                global_validation_df = ray.get(validation_df_id)
                trial_validation_df = deepcopy(global_validation_df)
                validation_data = trial_ft.transform(trial_validation_df)

            # no need to call build since it is called the first time fit_eval is called.
            # callbacks = [TuneCallback(tune_reporter)]
            # fit model
            best_reward_m = -999
            reward_m = -999
            ckpt_name = "best.ckpt"  # constant; hoisted out of the loop
            for i in range(1, 101):
                result = trial_model.fit_eval(x_train,
                                              y_train,
                                              validation_data=validation_data,
                                              mc=mc,
                                              # verbose=1,
                                              **config)
                reward_m = metric_op * result  # metric_op flips the sign so that larger reward is always better
                # if metric == "mean_squared_error":
                #     reward_m = (-1) * result
                #     # print("running iteration: ",i)
                # elif metric == "r_square":
                #     reward_m = result
                # else:
                #     raise ValueError("metric can only be \"mean_squared_error\" or \"r_square\"")
                ckpt_name = "best.ckpt"
                if reward_m > best_reward_m:
                    best_reward_m = reward_m
                    save_zip(ckpt_name, trial_ft, trial_model, config)
                    if remote_dir is not None:
                        upload_ppl_hdfs(remote_dir, ckpt_name)

                tune_reporter(
                    training_iteration=i,
                    reward_metric=reward_m,
                    checkpoint="best.ckpt"
                )
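
For context, a function trainable like this is typically handed to Ray Tune's tune.run. The sketch below assumes the legacy reporter-based function API (pre Ray 1.0), where the trainable receives (config, reporter), and assumes train_func has already closed over the ray.put object ids shown above; search_space is a hypothetical dict that in this code base would come from recipe.search_space().

import ray
from ray import tune

ray.init()

# search_space is a hypothetical placeholder for the recipe-provided space
analysis = tune.run(
    train_func,                        # the (config, tune_reporter) trainable above
    config=search_space,               # hyper-parameter search space for the trials
    num_samples=4,                     # number of trials to sample
    stop={"training_iteration": 100},  # matches the 100-iteration loop above
)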
Example #2
    def _hp_search(self,
                   input_df,
                   validation_df,
                   metric,
                   recipe,
                   mc,
                   resources_per_trial,
                   remote_dir):

        ft = TimeSequenceFeatureTransformer(self.future_seq_len,
                                            self.dt_col,
                                            self.target_col,
                                            self.extra_features_col,
                                            self.drop_missing)
        if isinstance(input_df, list):
            feature_list = ft.get_feature_list(input_df[0])
        else:
            feature_list = ft.get_feature_list(input_df)

        # model = VanillaLSTM(check_optional_config=False)
        model = TimeSequenceModel(check_optional_config=False, future_seq_len=self.future_seq_len)

        # prepare parameters for search engine
        search_space = recipe.search_space(feature_list)
        runtime_params = recipe.runtime_params()
        num_samples = runtime_params['num_samples']
        stop = dict(runtime_params)
        del stop['num_samples']  # everything except num_samples is a stop criterion
        search_algorithm_params = recipe.search_algorithm_params()
        search_algorithm = recipe.search_algorithm()
        fixed_params = recipe.fixed_params()

        searcher = RayTuneSearchEngine(logs_dir=self.logs_dir,
                                       resources_per_trial=resources_per_trial,
                                       name=self.name,
                                       remote_dir=remote_dir,
                                       )
        searcher.compile(input_df,
                         search_space=search_space,
                         stop=stop,
                         search_algorithm_params=search_algorithm_params,
                         search_algorithm=search_algorithm,
                         fixed_params=fixed_params,
                         # feature_transformers=TimeSequenceFeatures,
                         feature_transformers=ft,
                         # model=model,
                         future_seq_len=self.future_seq_len,
                         validation_df=validation_df,
                         metric=metric,
                         mc=mc,
                         num_samples=num_samples)
        # searcher.test_run()
        searcher.run()

        best = searcher.get_best_trials(k=1)[0]  # take the single best trial; could later return top n
        pipeline = self._make_pipeline(best,
                                       feature_transformers=ft,
                                       model=model,
                                       remote_dir=remote_dir)
        return pipeline
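
Note that _hp_search only relies on five methods of the recipe object. Below is a minimal sketch of a conforming recipe; the concrete values are illustrative placeholders, not real defaults, and returning None is assumed to fall back to the search engine's default behaviour.

class MinimalRecipe:
    """Sketch of the interface _hp_search expects from a recipe."""

    def search_space(self, all_available_features):
        # the tunable hyper-parameters; values here are placeholders
        return {
            "selected_features": all_available_features,
            "past_seq_len": 2,
            "batch_size": 64,
        }

    def runtime_params(self):
        # must contain 'num_samples'; the remaining keys become stop criteria
        return {"num_samples": 5, "training_iteration": 10}

    def search_algorithm(self):
        return None

    def search_algorithm_params(self):
        return None

    def fixed_params(self):
        return None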
Example #3
def load_ts_pipeline(file):
    feature_transformers = TimeSequenceFeatureTransformer()
    model = TimeSequenceModel(check_optional_config=False)

    all_config = restore_zip(file, feature_transformers, model)
    ts_pipeline = TimeSequencePipeline(feature_transformers=feature_transformers,
                                       model=model,
                                       config=all_config)
    print("Restore pipeline from", file)
    return ts_pipeline
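
A short usage sketch (the file name and test_df are hypothetical):

# restore a previously saved pipeline and predict on new data
ts_pipeline = load_ts_pipeline("my_pipeline.ppl")
y_pred = ts_pipeline.predict(test_df)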
Example #4
class TimeSequencePipeline(Pipeline):
    def __init__(self,
                 feature_transformers=None,
                 model=None,
                 config=None,
                 name=None):
        """
        initialize a pipeline
        :param model: the internal model
        :param feature_transformers: the feature transformers
        """
        self.feature_transformers = feature_transformers
        self.model = model
        self.config = config
        self.name = name
        self.time = time.strftime("%Y%m%d-%H%M%S")

    def describe(self):
        init_info = [
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        ]
        print("**** Initialization info ****")
        for info in init_info:
            print(info + ":", self.config[info])
        print("")

    def fit(self, input_df, validation_df=None, mc=False, epoch_num=20):
        x, y = self.feature_transformers.transform(input_df, is_train=True)
        if validation_df is not None and not validation_df.empty:
            validation_data = self.feature_transformers.transform(
                validation_df)
        else:
            validation_data = None
        new_config = {'epochs': epoch_num}
        self.model.fit_eval(x,
                            y,
                            validation_data,
                            mc=mc,
                            verbose=1,
                            **new_config)
        print('Fit done!')

    def _is_val_df_valid(self, validation_df):
        df_not_empty = isinstance(validation_df,
                                  pd.DataFrame) and not validation_df.empty
        df_list_not_empty = isinstance(validation_df, list) \
            and validation_df and not all(d.empty for d in validation_df)
        return df_not_empty or df_list_not_empty

    def _check_configs(self):
        required_configs = {'future_seq_len'}
        if not required_configs <= self.config.keys():
            raise ValueError("Missing required parameters in configuration. " +
                             "Required parameters are: " +
                             str(required_configs))
        default_config = {
            'dt_col': 'datetime',
            'target_col': 'value',
            'extra_features_col': None,
            'drop_missing': True,
            'past_seq_len': 2,
            'batch_size': 64,
            'lr': 0.001,
            'dropout': 0.2,
            'epochs': 10,
            'metric': 'mse'
        }
        for config, value in default_config.items():
            if config not in self.config:
                print('Config: \'{}\' is not specified. '
                      'A default value of {} will be used.'.format(
                          config, value))

    def get_default_configs(self):
        default_configs = {
            'dt_col': 'datetime',
            'target_col': 'value',
            'extra_features_col': None,
            'drop_missing': True,
            'future_seq_len': 1,
            'past_seq_len': 2,
            'batch_size': 64,
            'lr': 0.001,
            'dropout': 0.2,
            'epochs': 10,
            'metric': 'mean_squared_error'
        }
        print("**** default config: ****")
        for config in default_configs:
            print(config + ":", default_configs[config])
        print(
            "You can change any field in the default configs by passing it into "
            "fit_with_fixed_configs(). Otherwise, the default values will be used."
        )
        return default_configs

    def fit_with_fixed_configs(self,
                               input_df,
                               validation_df=None,
                               mc=False,
                               **user_configs):
        """
        Fit pipeline with fixed configs. The model will be trained from initialization
        with the hyper-parameter specified in configs. The configs contain both identity configs
        (Eg. "future_seq_len", "dt_col", "target_col", "metric") and automl tunable configs
        (Eg. "past_seq_len", "batch_size").
        We recommend calling get_default_configs to see the name and default values of configs you
        you can specify.
        :param input_df: one data frame or a list of data frames
        :param validation_df: one data frame or a list of data frames
        :param user_configs: you can overwrite or add more configs with user_configs. Eg. "epochs"
        :return:
        """
        # self._check_configs()
        if self.config is None:
            self.config = self.get_default_configs()
        if user_configs:
            self.config.update(user_configs)
        ft_id_config_set = {
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        }
        ft_id_configs = {a: self.config[a] for a in ft_id_config_set}
        self.feature_transformers = TimeSequenceFeatureTransformer(
            **ft_id_configs)
        model_id_config_set = {'future_seq_len'}
        model_id_configs = {a: self.config[a] for a in model_id_config_set}
        self.model = TimeSequenceModel(check_optional_config=False,
                                       **model_id_configs)
        all_available_features = self.feature_transformers.get_feature_list(
            input_df)
        self.config.update({"selected_features": all_available_features})
        (x_train, y_train) = self.feature_transformers.fit_transform(
            input_df, **self.config)
        if self._is_val_df_valid(validation_df):
            validation_data = self.feature_transformers.transform(
                validation_df)
        else:
            validation_data = None

        self.model.fit_eval(x_train,
                            y_train,
                            validation_data=validation_data,
                            mc=mc,
                            verbose=1,
                            **self.config)

    def evaluate(self, input_df, metrics=None, multioutput='raw_values'):
        """
        evaluate the pipeline
        :param input_df: the input data frame to evaluate on
        :param metrics: a list of metric names, subset of
                ['mean_squared_error', 'r_square', 'sMAPE']; defaults to ["mse"]
        :param multioutput: string in ['raw_values', 'uniform_average']
                'raw_values' :
                    Returns a full set of errors in case of multioutput input.
                'uniform_average' :
                    Errors of all outputs are averaged with uniform weight.
        :return: a list with one evaluation result per metric
        """
        if metrics is None:
            metrics = ["mse"]  # avoid a mutable default argument
        if isinstance(metrics, str):
            metrics = [metrics]
        # if not isinstance(metrics, list):
        #    raise ValueError("Expected metrics to be a list!")

        x, y = self.feature_transformers.transform(input_df, is_train=True)
        y_pred = self.model.predict(x)
        if y_pred.shape[1] == 1:
            multioutput = 'uniform_average'
        y_unscale, y_pred_unscale = self.feature_transformers.post_processing(
            input_df, y_pred, is_train=True)

        return [
            Evaluator.evaluate(m,
                               y_unscale,
                               y_pred_unscale,
                               multioutput=multioutput) for m in metrics
        ]

    def predict(self, input_df):
        """
        predict on test data with the fitted pipeline
        :param input_df: the input data frame to predict on
        :return: the prediction result after post-processing
        """
        x, _ = self.feature_transformers.transform(input_df, is_train=False)
        y_pred = self.model.predict(x)
        y_output = self.feature_transformers.post_processing(input_df,
                                                             y_pred,
                                                             is_train=False)
        return y_output

    def predict_with_uncertainty(self, input_df, n_iter=100):
        x, _ = self.feature_transformers.transform(input_df, is_train=False)
        y_pred, y_pred_uncertainty = self.model.predict_with_uncertainty(
            x=x, n_iter=n_iter)
        y_output = self.feature_transformers.post_processing(input_df,
                                                             y_pred,
                                                             is_train=False)
        y_uncertainty = self.feature_transformers.unscale_uncertainty(
            y_pred_uncertainty)
        return y_output, y_uncertainty

    def save(self, ppl_file=None):
        """
        Save the pipeline to a file, including the feature transformers, the model
        and the trial config.
        :param ppl_file: the file path to save to; defaults to a timestamped
                file under DEFAULT_PPL_DIR
        :return: the file path the pipeline was saved to
        """
        ppl_file = ppl_file or os.path.join(
            DEFAULT_PPL_DIR, "{}_{}.ppl".format(self.name, self.time))
        save_zip(ppl_file, self.feature_transformers, self.model, self.config)
        print("Pipeline is saved in", ppl_file)
        return ppl_file

    def config_save(self, config_file=None):
        """
        Save all configs to a file.
        :param config_file: the file path to save to; defaults to a timestamped
                file under DEFAULT_CONFIG_DIR
        :return: the file path the configs were saved to
        """
        config_file = config_file or os.path.join(
            DEFAULT_CONFIG_DIR, "{}_{}.json".format(self.name, self.time))
        save_config(config_file, self.config, replace=True)
        return config_file
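
Putting the class together, a typical life cycle looks like the sketch below. The data frames and names are hypothetical, and the frames are assumed to contain the default "datetime" and "value" columns from get_default_configs.

# end-to-end sketch: train with fixed configs, evaluate, predict, persist
pipeline = TimeSequencePipeline(name="my_pipeline")
pipeline.fit_with_fixed_configs(train_df,
                                validation_df=val_df,
                                epochs=20)        # user_configs overwrite the defaults
results = pipeline.evaluate(test_df, metrics=["mse"])
y_pred = pipeline.predict(test_df)

# Monte Carlo uncertainty requires a model fitted with mc=True
# y_out, y_unc = pipeline.predict_with_uncertainty(test_df, n_iter=100)

ppl_file = pipeline.save()           # defaults to DEFAULT_PPL_DIR/<name>_<time>.ppl
config_file = pipeline.config_save()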
Example #5
        class TrainableClass(Trainable):

            def _setup(self, config):
                # print("config in set up is", config)
                global_ft = ray.get(ft_id)
                # global_model = ray.get(model_id)
                self.trial_ft = deepcopy(global_ft)
                self.trial_model = TimeSequenceModel(check_optional_config=False,
                                                     future_seq_len=future_seq_len)

                # handling input
                global_input_df = ray.get(input_df_id)
                trial_input_df = deepcopy(global_input_df)
                self.config = convert_bayes_configs(config).copy()
                (self.x_train, self.y_train) = self.trial_ft.fit_transform(trial_input_df,
                                                                           **self.config)
                # trial_ft.fit(trial_input_df, **config)

                # handling validation data
                self.validation_data = None
                if is_val_df_valid:
                    global_validation_df = ray.get(validation_df_id)
                    trial_validation_df = deepcopy(global_validation_df)
                    self.validation_data = self.trial_ft.transform(trial_validation_df)

                # no need to call build since it is called the first time fit_eval is called.
                # callbacks = [TuneCallback(tune_reporter)]
                # fit model
                self.best_reward_m = -999
                self.reward_m = -999
                self.ckpt_name = "pipeline.ckpt"

            def _train(self):
                # print("self.config in train is ", self.config)
                result = self.trial_model.fit_eval(self.x_train, self.y_train,
                                                   validation_data=self.validation_data,
                                                   # verbose=1,
                                                   **self.config)
                self.reward_m = metric_op * result  # metric_op flips the sign so that larger reward is always better
                # if metric == "mean_squared_error":
                #     self.reward_m = (-1) * result
                #     # print("running iteration: ",i)
                # elif metric == "r_square":
                #     self.reward_m = result
                # else:
                #     raise ValueError("metric can only be \"mean_squared_error\" or \"r_square\"")
                return {"reward_metric": self.reward_m, "checkpoint": self.ckpt_name}

            def _save(self, checkpoint_dir):
                # print("checkpoint dir is ", checkpoint_dir)
                ckpt_name = self.ckpt_name
                # save in the working dir (without "checkpoint_{}".format(training_iteration))
                path = os.path.join(checkpoint_dir, "..", ckpt_name)
                # path = os.path.join(checkpoint_dir, ckpt_name)
                # print("checkpoint save path is ", checkpoint_dir)
                if self.reward_m > self.best_reward_m:
                    self.best_reward_m = self.reward_m
                    print("****this reward is", self.reward_m)
                    print("*********saving checkpoint")
                    save_zip(ckpt_name, self.trial_ft, self.trial_model, self.config)
                    if remote_dir is not None:
                        upload_ppl_hdfs(remote_dir, ckpt_name)
                return path

            def _restore(self, checkpoint_path):
                # print("checkpoint path in restore is ", checkpoint_path)
                if remote_dir is not None:
                    restore_hdfs(checkpoint_path, remote_dir, self.trial_ft, self.trial_model)
                else:
                    restore_zip(checkpoint_path, self.trial_ft, self.trial_model)
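
For reference, a class-based trainable like this one plugs into the same legacy Ray Tune entry point: Tune drives _setup and _train, and calls _save/_restore around checkpoints. A launch sketch, with search_space again a hypothetical placeholder for the recipe-provided space:

from ray import tune

analysis = tune.run(
    TrainableClass,
    config=search_space,               # passed to _setup as `config`
    num_samples=4,
    stop={"training_iteration": 100},
    checkpoint_freq=1,                 # checkpoint every iteration via _save
)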
Example #6
    def model_create_func():
        # model = VanillaLSTM(check_optional_config=False)
        model = TimeSequenceModel(
            check_optional_config=False,
            future_seq_len=self.future_seq_len)
        return model
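
model_create_func is a closure: it captures self.future_seq_len from the enclosing method, so the model is only constructed when the factory is called, not where it is defined. The same pattern in isolation (make_model_factory is a hypothetical name used only for illustration):

# deferred construction: future_seq_len is bound now, the model is built later
def make_model_factory(future_seq_len):
    def model_create_func():
        return TimeSequenceModel(
            check_optional_config=False,
            future_seq_len=future_seq_len)
    return model_create_func

factory = make_model_factory(future_seq_len=1)
model = factory()   # instantiation happens here, not at definition time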