Example #1
    def test_prefix_columns_concatenator(self):
        data = get_dataset('iris').as_df()
        xf = PrefixColumnConcatenator(columns={
            'Spl': 'Sepal_',
            'Pet': 'Petal_'
        })
        features = xf.fit_transform(data)

        assert features.shape == (150, 11)
        assert set(features.columns) == {
            'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width',
            'Label', 'Species', 'Setosa', 'Spl.Sepal_Length',
            'Spl.Sepal_Width', 'Pet.Petal_Length', 'Pet.Petal_Width'
        }
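The method above comes from nimbusml's unittest suite; a standalone sketch of the same check, assuming only the documented get_dataset and PrefixColumnConcatenator APIs:

from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import PrefixColumnConcatenator

# Concatenate all 'Sepal_*' columns into a vector column 'Spl'
# and all 'Petal_*' columns into a vector column 'Pet'.
data = get_dataset('iris').as_df()
xf = PrefixColumnConcatenator(columns={'Spl': 'Sepal_', 'Pet': 'Petal_'})
features = xf.fit_transform(data)

# The 7 original columns are kept and 4 slot columns are added
# (Spl.Sepal_Length, Spl.Sepal_Width, Pet.Petal_Length, Pet.Petal_Width).
print(features.shape)  # expected: (150, 11)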
Example #2
    def test_combined_models_support_predict_proba_with_more_than_2_classes(
            self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(data)

        feature_cols = ['education', 'age']
        training_pipeline = Pipeline([
            DatasetTransformer(featurization_pipeline.model),
            OneVsRestClassifier(LogisticRegressionBinaryClassifier(),
                                feature=feature_cols,
                                label='induced')
        ])
        training_pipeline.fit(data, output_predictor_model=True)

        concat_pipeline = Pipeline(
            [PrefixColumnConcatenator({'education': 'education.'})])
        concat_pipeline.fit(featurized_data)

        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(training_pipeline.predictor_model)

        concat_and_predictor_pipeline = Pipeline.combine_models(
            concat_pipeline, predictor_pipeline)

        result = concat_and_predictor_pipeline.predict_proba(featurized_data)
        self.assertEqual(result.shape[1], 3)
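This snippet is also a test-class method; the names it references resolve to the following imports, a sketch based on nimbusml's documented module layout:

from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionBinaryClassifier
from nimbusml.multiclass import OneVsRestClassifier
from nimbusml.preprocessing import DatasetTransformer
from nimbusml.preprocessing.schema import PrefixColumnConcatenator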
Example #3
    def test_vectorized_with_prefixconcat_output_predictor_model(self):
        """
        This test shows how to prepend ColumnConcatenator transform
        to outputted predictor model from combined (with featurizers) pipeline
        so it successfully runs on featurized data with vectors.
        """
        # Create and fit a OneHotVectorizer transform using the
        # training data and use it to transform the training data.
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                      random_state=seed)
        transform_pipeline.fit(train_df)
        df = transform_pipeline.transform(train_df)

        # Create, fit and score with combined model.
        # Output predictor model separately.
        combined_pipeline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2')
        ], random_state=seed)
        combined_pipeline.fit(train_df, output_predictor_model=True)
        result_1 = combined_pipeline.predict(train_df)

        # train ColumnConcatenator on featurized data
        concat_pipeline = Pipeline(
            [PrefixColumnConcatenator(columns={'c0': 'c0.'})])
        concat_pipeline.fit(df)

        # Load predictor pipeline
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)

        # combine concat and predictor models and score
        combined_predictor_pipeline = Pipeline.combine_models(
            concat_pipeline, predictor_pipeline)
        result_2 = combined_predictor_pipeline.predict(df)

        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
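Here train_df and seed are module-level fixtures defined elsewhere in the test file; the classes used come from the following imports (assumed, per nimbusml's documented module layout):

from nimbusml import Pipeline
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import OnlineGradientDescentRegressor
from nimbusml.preprocessing.schema import PrefixColumnConcatenator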
Example #4
                         label='induced'),
 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \
     OneVsRestClassifier(LinearSvmBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
                                  'education_str.12+ yrs'],
                         label='induced'),
 'PcaAnomalyDetector': PcaAnomalyDetector(rank=3),
 'PcaTransformer': PcaTransformer(rank=2),
 'PixelExtractor': Pipeline([
     Loader(columns={'ImgPath': 'Path'}),
     PixelExtractor(columns={'ImgPixels': 'ImgPath'}),
 ]),
 'PrefixColumnConcatenator': PrefixColumnConcatenator(columns={'Features': 'Sepal_'}),
 'Resizer': Pipeline([
     Loader(columns={'ImgPath': 'Path'}),
     Resizer(image_width=227, image_height=227,
             columns={'ImgResize': 'ImgPath'})
 ]),
 'SkipFilter': SkipFilter(count=5),
 'SsaSpikeDetector': SsaSpikeDetector(columns=['Sepal_Length'],
                                      seasonal_window_size=2),
 'SsaChangePointDetector': SsaChangePointDetector(columns=['Sepal_Length'],
                                                  seasonal_window_size=2),
 'SsaForecaster': SsaForecaster(columns=['Sepal_Length'],
                                window_size=2,
                                series_length=5,
                                train_size=5,
                                horizon=1),
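Example #4 is a fragment of a dictionary mapping estimator names to pre-configured instances; its opening entries are cut off above. The entries shown resolve to roughly these imports (module paths follow nimbusml's documentation; the location of LinearSvmBinaryClassifier is an assumption):

from nimbusml import Pipeline
from nimbusml.decomposition import PcaAnomalyDetector, PcaTransformer
from nimbusml.feature_extraction.image import Loader, PixelExtractor, Resizer
from nimbusml.linear_model import LinearSvmBinaryClassifier  # path assumed
from nimbusml.multiclass import OneVsRestClassifier
from nimbusml.preprocessing.filter import SkipFilter
from nimbusml.preprocessing.schema import PrefixColumnConcatenator
from nimbusml.timeseries import (SsaChangePointDetector, SsaForecaster,
                                 SsaSpikeDetector)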
Example #5
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LogisticRegressionClassifier
from nimbusml.preprocessing.schema import PrefixColumnConcatenator
from nimbusml.preprocessing.schema import ColumnDropper
from sklearn.model_selection import train_test_split

# use 'iris' data set to create test and train data
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
df = get_dataset("iris").as_df()

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

concat = PrefixColumnConcatenator() << {'Sepal': 'Sepal_'}
concat1 = PrefixColumnConcatenator() << {'Petal': 'Petal_'}
dropcols = ColumnDropper() << [
    'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa',
    'Species'
]

pipeline = Pipeline(
    [concat, concat1, dropcols,
     LogisticRegressionClassifier()])
pipeline.fit(X_train, y_train)

# Evaluate the model
metrics, scores = pipeline.test(X_test, y_test, output_scores=True)
print(metrics)
Example #6
###############################################################################
# PrefixColumnConcatenator
import numpy as np
import pandas as pd
from nimbusml.preprocessing.schema import PrefixColumnConcatenator

data = pd.DataFrame(data=dict(PrefixA=[2.5, np.nan, 2.1, 1.0],
                              PrefixB=[.75, .9, .8, .76],
                              AnotherColumn=[np.nan, 2.5, 2.6, 2.4]))

# transform usage
xf = PrefixColumnConcatenator(columns={'combined': 'Prefix'})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#   PrefixA  PrefixB  AnotherColumn  combined.PrefixA  combined.PrefixB
#0      2.5     0.75            NaN               2.5              0.75
#1      NaN     0.90            2.5               NaN              0.90
#2      2.1     0.80            2.6               2.1              0.80
#3      1.0     0.76            2.4               1.0              0.76
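As the printed output shows, the transform selects every input column whose name starts with the given prefix ('Prefix' matches PrefixA and PrefixB but not AnotherColumn), concatenates them into a single vector column named 'combined', and leaves the originals in place; when the result is returned as a DataFrame, the vector is expanded into the slot columns 'combined.PrefixA' and 'combined.PrefixB'.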