def _resample(self, data_x: pd.DataFrame, data_y: np.ndarray,
              hyper_params: HyperParamsBase = None) -> ResamplerResults:
    result_scores = list()
    training_indexes, test_indexes = self._stratified_splitter.split_monte_carlo(
        target_values=data_y, samples=self._repeats, seed=42)

    for train_ind, test_ind in zip(training_indexes, test_indexes):
        # the splitter returns positional indexes, so use `.iloc` on the DataFrame
        train_x_not_transformed, holdout_x_not_transformed = data_x.iloc[train_ind], data_x.iloc[test_ind]
        train_y, test_y = data_y[train_ind], data_y[test_ind]

        # fit the transformations on the training data only, then apply them to the holdout data,
        # so that no information leaks from the holdout set into the transformations
        pipeline = TransformerPipeline(transformations=self._transformations)
        train_x_transformed = pipeline.fit_transform(data_x=train_x_not_transformed)
        holdout_x_transformed = pipeline.transform(data_x=holdout_x_not_transformed)

        if self._train_callback is not None:
            self._train_callback(train_x_transformed, train_y, hyper_params)

        model_copy = self._model.clone()  # need a fresh model object for each fold/repeat
        # train on the *transformed* training data, matching the transformed holdout data we predict on
        model_copy.train(data_x=train_x_transformed, data_y=train_y, hyper_params=hyper_params)

        # for each score, calculate the metric on the holdout predictions and collect the results,
        # which are passed to the ResamplerResults below
        fold_scores = list()
        for score in self._scores:
            score_copy = score.clone()  # need a fresh score object for each fold/repeat
            score_copy.calculate(actual_values=test_y,
                                 predicted_values=model_copy.predict(data_x=holdout_x_transformed))  # noqa
            fold_scores.append(score_copy)
        result_scores.append(fold_scores)

    return ResamplerResults(scores=result_scores, decorators=None)
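# For illustration, a minimal, self-contained sketch of the repeated stratified Monte-Carlo split that
# `_resample` relies on, using scikit-learn's `StratifiedShuffleSplit` (an assumption for the example;
# this codebase uses its own `_stratified_splitter`, but the mechanics are equivalent):
def _example_monte_carlo_split():
    import numpy as np
    from sklearn.model_selection import StratifiedShuffleSplit

    x = np.arange(20).reshape(10, 2)
    y = np.array([0, 1] * 5)
    # each of the 3 "repeats" draws a fresh stratified train/test split from the same data
    splitter = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
    for train_ind, test_ind in splitter.split(x, y):
        # both subsets preserve the class proportions of `y`
        print(sorted(train_ind), sorted(test_ind))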
def get_final_datasets(data, target_variable, splitter, transformations):
    # if we have a splitter, split into training and holdout; else just do transformations on all data
    if splitter:
        training_indexes, holdout_indexes = splitter.split(target_values=data[target_variable])
    else:
        training_indexes, holdout_indexes = range(len(data)), []

    training_y = data.iloc[training_indexes][target_variable]
    training_x = data.iloc[training_indexes].drop(columns=target_variable)
    holdout_y = data.iloc[holdout_indexes][target_variable]
    holdout_x = data.iloc[holdout_indexes].drop(columns=target_variable)

    # transform on training data
    if transformations is not None:
        # before we fit the transformations, we 'peek' at what the expected columns will be using ALL
        # the data. The reason is that if we do some sort of encoding (dummy/one-hot), but not all of
        # the categories are included in the training set (i.e. maybe only a small number of
        # observations have the categoric value), we can still ensure that we will be giving the same
        # expected columns/encodings to the `predict` method with the holdout set.
        # noinspection PyTypeChecker
        expected_columns = TransformerPipeline.get_expected_columns(
            data=data.drop(columns=target_variable),  # noqa
            transformations=transformations)
        transformer = StatelessTransformer(
            custom_function=lambda x_df: x_df.reindex(columns=expected_columns,  # noqa
                                                      fill_value=0))
        transformations = transformations + [transformer]

    pipeline = TransformerPipeline(transformations=transformations)
    # peek at all the data (except for the target variable, of course)
    # noinspection PyTypeChecker
    pipeline.peak(data_x=data.drop(columns=target_variable))
    # fit on only the training data-set (and also transform)
    transformed_training_x = pipeline.fit_transform(training_x)
    if len(holdout_indexes) > 0:
        transformed_holdout_x = pipeline.transform(holdout_x)
    else:
        transformed_holdout_x = holdout_x

    return transformed_training_x, training_y, transformed_holdout_x, holdout_y, pipeline
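# For illustration, a minimal pandas-only sketch of why `get_final_datasets` reindexes against the
# expected columns from the *full* dataset (no project classes are assumed here): if a category never
# appears in the training split, encoding the holdout set alone would otherwise produce columns the
# fitted model has never seen, or omit columns it expects.
def _example_reindex_for_unseen_categories():
    import pandas as pd

    train = pd.get_dummies(pd.DataFrame({'color': ['red', 'blue']}))
    holdout = pd.get_dummies(pd.DataFrame({'color': ['green']}))
    # the columns we would expect if the encoding had seen ALL the data
    expected_columns = ['color_blue', 'color_green', 'color_red']
    train = train.reindex(columns=expected_columns, fill_value=0)
    holdout = holdout.reindex(columns=expected_columns, fill_value=0)
    # both data-sets now share an identical column layout
    assert list(train.columns) == list(holdout.columns)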
class ModelTrainer:
    """
    ModelTrainer encapsulates the (mundane and repetitive) logic of the general process of training
    a model, including:

        - splitting the data into training and holdout sets
        - data transformations & pre-processing
        - training a model
        - predicting on a holdout data-set, or on future data (applying the same transformations)
        - evaluating the performance of the model on a holdout set
    """
    def __init__(self,
                 model: ModelWrapperBase,
                 model_transformations: Union[List[TransformerBase], None] = None,
                 splitter: DataSplitterBase = None,
                 evaluator: EvaluatorBase = None,
                 scores: List[ScoreBase] = None,
                 persistence_manager: PersistenceManagerBase = None,
                 train_callback: Callable[[pd.DataFrame, np.ndarray,
                                           Union[HyperParamsBase, None]], None] = None):
        """
        :param model: a class representing the model to train
        :param model_transformations: a list of transformations to apply before training (and
            predicting)
        :param splitter: a class encapsulating the logic of splitting the data into training and
            holdout sets; if None, no split occurs and the model is trained on all the data (so no
            holdout evaluator or scores are available)
        :param evaluator: a class encapsulating the logic of evaluating a holdout set
        :param scores: a list of Score objects
        :param persistence_manager: a PersistenceManager defining how the underlying models should be
            cached; optional
        :param train_callback: a callback that is called before the model is trained, receiving the
            data_x, data_y, and hyper_params that are passed into `ModelWrapper.train()`. The primary
            intent is for unit tests to have the ability to ensure that the data (data_x) is being
            transformed as expected, but users may also benefit from this capability to peek at the
            data that is being trained.
""" assert isinstance(model, ModelWrapperBase) self._model = model self._splitter = splitter self._training_evaluator = evaluator # copy so that we can use 'same' evaluator type in the holdout evaluator self._holdout_evaluator = copy.deepcopy(evaluator) self._training_scores = scores self._holdout_scores = None if scores is None else [ x.clone() for x in scores ] self._has_fitted = False self._persistence_manager = persistence_manager self._train_callback = train_callback if model_transformations is not None: assert isinstance(model_transformations, list) assert all([ isinstance(x, TransformerBase) for x in model_transformations ]) self._model_transformations = model_transformations self._pipeline = None def __str__(self): val = str(self.model) # either show evaluator info or scores if self.training_evaluator is not None: val += "\n\nTraining Evaluator\n==================\n" val += "\n" + str(self.training_evaluator) if self.holdout_evaluator is not None: val += "\n\nHoldout Evaluator\n=================\n" val += "\n" + str(self.holdout_evaluator) else: if self.training_scores is not None: val += "\n\nTraining Scores\n===============\n" for score in self.training_scores: val += "\n" + str(score) if self.holdout_scores is not None: val += "\n\nHoldout Scores\n===============" for score in self.holdout_scores: val += "\n" + str(score) return val @property def model(self) -> ModelWrapperBase: """ :return: underlying model object """ if self._has_fitted is False: raise ModelNotFittedError() return self._model def set_persistence_manager(self, persistence_manager: PersistenceManagerBase): """ Sets the persistence manager, defining how the underlying model should be cached :param persistence_manager: :return: """ self._persistence_manager = persistence_manager @staticmethod def _build_cache_key(model: ModelWrapperBase, hyper_params: HyperParamsBase) -> str: """ helper function to build the cache key (e.g. file name) """ model_name = model.name if hyper_params is None: key = model_name else: # if hyper-params, flatten out list of param names and values and concatenate/join them together hyper_params_long = '_'.join( list( sum([(str(x), str(y)) for x, y in hyper_params.params_dict.items()], ()))) # noqa return model_name + '_' + hyper_params_long return key def train_predict_eval(self, data: pd.DataFrame, target_variable: Union[str, None] = None, hyper_params: HyperParamsBase = None) -> np.ndarray: """ The data is split into a training/holdout set if a Splitter is provided. If not provided, no split occurs and the model is trained on all the `data`). Before training, the data is transformed by the specified Transformation objects. If a Splitter is provided, the transformations are 'fit/transformed' on the training and only transformed on the holdout. Trains the data on the model, predicts, and evaluates the predictions if an Evaluator or Scores are passed in. If a Splitter is provide, the predictions that are returned are of the holdout set. Otherwise, the predictions form the training set are returned. 
        :param data: the data to split (if a Splitter is provided) and train the model on
        :param target_variable: the name of the target variable/column
        :param hyper_params: a corresponding HyperParams object
        """
        if self._has_fitted:
            raise ModelAlreadyFittedError()

        if self._splitter:
            assert target_variable is not None
            training_indexes, holdout_indexes = self._splitter.split(target_values=data[target_variable])
        else:
            # we are fitting the entire data-set; there is no holdout data-set/evaluator/scores
            training_indexes, holdout_indexes = range(len(data)), []
            self._holdout_evaluator = None
            self._holdout_scores = None

        # for unsupervised problems, there might not be a target variable;
        # in that case, there will also not be a training_y/holdout_y
        training_y = data.iloc[training_indexes][target_variable] if target_variable is not None else None
        training_x = data.iloc[training_indexes]
        holdout_y = data.iloc[holdout_indexes][target_variable] if target_variable is not None else None
        holdout_x = data.iloc[holdout_indexes]

        if target_variable is not None:
            training_x = training_x.drop(columns=target_variable)
            holdout_x = holdout_x.drop(columns=target_variable)

        # transform/fit on training data
        if self._model_transformations is not None:
            # before we fit the transformations, we 'peek' at what the expected columns will be using
            # ALL the data. The reason is that if we do some sort of encoding (dummy/one-hot), but not
            # all of the categories are included in the training set (i.e. maybe only a small number
            # of observations have the categoric value), we can still ensure that we will be giving
            # the same expected columns/encodings to the `predict` method with the holdout set.
            expected_columns = TransformerPipeline.\
                get_expected_columns(data=data if target_variable is None else data.drop(columns=target_variable),  # noqa
                                     transformations=self._model_transformations)
            transformer = StatelessTransformer(
                custom_function=lambda x_df: x_df.reindex(columns=expected_columns,  # noqa
                                                          fill_value=0))
            self._model_transformations = self._model_transformations + [transformer]

        self._pipeline = TransformerPipeline(transformations=self._model_transformations)
        # peek at all the data (except for the target variable, of course)
        # noinspection PyTypeChecker
        self._pipeline.peak(data_x=data if target_variable is None else data.drop(columns=target_variable))
        # fit on only the training data-set (and also transform)
        transformed_training_data = self._pipeline.fit_transform(training_x)

        # set up persistence if applicable
        if self._persistence_manager is not None:  # then build the key
            cache_key = ModelTrainer._build_cache_key(model=self._model, hyper_params=hyper_params)
            self._persistence_manager.set_key(key=cache_key)
            self._model.set_persistence_manager(persistence_manager=self._persistence_manager)

        if self._train_callback is not None:
            self._train_callback(transformed_training_data, training_y, hyper_params)

        # train the model with the transformed training data
        self._model.train(data_x=transformed_training_data, data_y=training_y, hyper_params=hyper_params)
        self._has_fitted = True

        training_predictions = self.predict(data_x=training_x)
        holdout_predictions = None
        if self._splitter is not None:
            holdout_predictions = self.predict(data_x=holdout_x)

        # if evaluators, evaluate on both the training and holdout sets
        if self._training_evaluator is not None:
            # predict will apply the transformations (which are fitted on the training data)
            self._training_evaluator.evaluate(actual_values=training_y,
                                              predicted_values=training_predictions)
            if self._holdout_evaluator:
                self._holdout_evaluator.evaluate(actual_values=holdout_y,
                                                 predicted_values=holdout_predictions)

        # if scores, score on both the training and holdout sets
        if self._training_scores is not None:
            # predict will apply the transformations (which are fitted on the training data)
            for score in self._training_scores:
                ScoreMediator.calculate(score=score,
                                        data_x=transformed_training_data,
                                        actual_target_variables=training_y,
                                        predicted_values=training_predictions)
            if self._holdout_scores:
                for score in self._holdout_scores:
                    ScoreMediator.calculate(score=score,
                                            data_x=holdout_x,  # TODO: may have to manually do transformations
                                            actual_target_variables=holdout_y,
                                            predicted_values=holdout_predictions)

        return training_predictions if self._splitter is None else holdout_predictions

    def predict(self, data_x: pd.DataFrame) -> np.ndarray:
        """
        `predict` applies the same transformations that were fitted on the training data to `data_x`,
        then predicts.

        :param data_x: an unprocessed DataFrame (unprocessed in terms of the model-specific
            transformation pipeline; i.e.
            exactly the same transformations will be applied to this data as were applied to the
            training data)
        :return: predicted values
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()

        prepared_prediction_set = self._pipeline.transform(data_x)
        predictions = self._model.predict(data_x=prepared_prediction_set)
        if isinstance(predictions, pd.DataFrame):
            # noinspection PyTypeChecker
            assert all(predictions.index.values == data_x.index.values)
        return predictions

    @property
    def training_evaluator(self) -> Union[EvaluatorBase, None]:
        """
        :return: if an Evaluator was provided via the class constructor, returns the object evaluated
            on the training data
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()
        return self._training_evaluator

    @property
    def holdout_evaluator(self) -> Union[EvaluatorBase, None]:
        """
        :return: if an Evaluator *and* a Splitter (thus creating a holdout set before training) were
            provided via the class constructor, returns the object evaluated on the holdout data
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()
        return self._holdout_evaluator

    @property
    def training_scores(self) -> Union[List[ScoreBase], None]:
        """
        :return: if a list of Scores was provided via the class constructor, returns the list of
            Scores calculated on the training data
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()
        return self._training_scores

    @property
    def holdout_scores(self) -> Union[List[ScoreBase], None]:
        """
        :return: if a list of Scores *and* a Splitter (thus creating a holdout set before training)
            were provided via the class constructor, returns the list of Scores calculated on the
            holdout data
        """
        if self._has_fitted is False:
            raise ModelNotFittedError()
        return self._holdout_scores
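# Hedged usage sketch for ModelTrainer. The concrete classes below (`LinearRegressorMW`,
# `ImputationTransformer`, `RegressionStratifiedDataSplitter`, `RmseScore`) are hypothetical
# placeholders for whatever ModelWrapperBase/TransformerBase/DataSplitterBase/ScoreBase
# implementations exist in this codebase:
#
#     trainer = ModelTrainer(model=LinearRegressorMW(),
#                            model_transformations=[ImputationTransformer()],
#                            splitter=RegressionStratifiedDataSplitter(holdout_ratio=0.2),
#                            scores=[RmseScore()])
#     # returns the holdout predictions because a Splitter was provided
#     holdout_predictions = trainer.train_predict_eval(data=df, target_variable='price')
#     print(trainer.holdout_scores)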
def resample_repeat(args):
    """
    NOTE: parallelization is per "repeat", not per "fold". This is because decorators can be used
    (and retained/cached) across folds, which would break if we split up and parallelized that logic.
    """
    folds = args['folds']
    repeat_index = args['repeat_index']
    data_x = args['data_x']
    data_y = args['data_y']
    transformer_factory = args['transformer_factory']
    train_callback = args['train_callback']
    hyper_params = args['hyper_params']
    model_factory = args['model_factory']
    persistence_manager = args['persistence_manager']
    score_factory = args['score_factory']
    decorators = args['decorators']

    # consistent folds per repeat index, but different folds for different repeats
    np.random.seed(repeat_index)
    # generate random fold #s that correspond to each index of the data
    random_folds = np.random.randint(low=0, high=folds, size=len(data_y))

    result_scores = list()  # list of all the calculated holdout scores
    for fold_index in range(folds):
        holdout_indexes = random_folds == fold_index  # indexes matching the fold belong to the holdout set
        training_indexes = ~holdout_indexes  # all other indexes belong to the training set

        # the odd naming serves as a distinction between transformed and non-transformed data
        train_x_not_transformed, holdout_x_not_transformed = data_x[training_indexes], \
            data_x[holdout_indexes]
        train_y, holdout_y = data_y[training_indexes], data_y[holdout_indexes]

        # NOTE: we fit the transformations on the k-1 folds (i.e. the local training data) for each of
        # the k times we train/predict. This is so we don't have any contamination/leakage into the
        # local holdout fold we are predicting on (just like we wouldn't fit the transformations on
        # the entire dataset; we fit/transform on the training set and then simply transform the
        # holdout).
        pipeline = TransformerPipeline(transformations=transformer_factory.get())
        # before we fit the transformations, we 'peek' at what the expected columns will be using ALL
        # the data. The reason is that if we do some sort of encoding (dummy/one-hot), but not all of
        # the categories are included in the training set (i.e. maybe only a small number of
        # observations have the categoric value), we can still ensure that we will be giving the same
        # expected columns/encodings to the `predict` method with the holdout set.
        pipeline.peak(data_x=data_x)
        # fit on only the training dataset (and also transform)
        train_x_transformed = pipeline.fit_transform(data_x=train_x_not_transformed)
        # transform (but don't fit) the holdout
        holdout_x_transformed = pipeline.transform(data_x=holdout_x_not_transformed)

        # the callback allows callers to see/verify the data that is being trained, at each fold
        if train_callback is not None:
            train_callback(train_x_transformed, train_y, hyper_params)

        model = model_factory.get_model()  # need a fresh model object for each fold/repeat

        # set up persistence if applicable
        if persistence_manager is not None:  # then build the key
            # first set the key_prefix; separating the repeat/fold information from the rest of the
            # key lets models (e.g.
            # ModelStacker) utilize the key_prefix while modifying the key
            persistence_manager.set_key_prefix(prefix='repeat{}_fold{}_'.format(str(repeat_index),
                                                                                str(fold_index)))
            cache_key = model_build_cache_key(model=model, hyper_params=hyper_params)
            persistence_manager.set_key(key=cache_key)
            model.set_persistence_manager(persistence_manager=persistence_manager)

        model.train(data_x=train_x_transformed, data_y=train_y, hyper_params=hyper_params)
        predicted_values = model.predict(data_x=holdout_x_transformed)

        fold_scores = list()
        for score in score_factory.get():  # cycle through the scores and store the results of each fold
            score.calculate(actual_values=holdout_y, predicted_values=predicted_values)
            fold_scores.append(score)
        result_scores.append(fold_scores)

        # execute any functionality that is dynamically attached via decorators
        if decorators:
            for decorator in decorators:
                decorator.decorate(repeat_index=repeat_index,
                                   fold_index=fold_index,
                                   scores=score_factory.get(),
                                   holdout_actual_values=holdout_y,
                                   holdout_predicted_values=predicted_values,
                                   holdout_indexes=holdout_x_transformed.index.values,
                                   model=model,
                                   transformer_pipeline=pipeline)

    return result_scores, decorators
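# For illustration, a minimal, self-contained sketch of the fold-assignment logic used above (no
# project classes are assumed): seeding numpy with the repeat index makes fold membership
# reproducible per repeat while still differing across repeats.
def _example_random_fold_assignment(folds: int = 5, num_rows: int = 12, repeat_index: int = 0):
    import numpy as np

    np.random.seed(repeat_index)  # consistent folds per repeat index
    random_folds = np.random.randint(low=0, high=folds, size=num_rows)
    for fold_index in range(folds):
        holdout_indexes = random_folds == fold_index
        training_indexes = ~holdout_indexes
        # every row lands in exactly one of the `folds` holdout sets per repeat
        assert holdout_indexes.sum() + training_indexes.sum() == num_rows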