Example #1
    def test_same_schema_with_dataframe_input(self):
        train_df_updated = train_df.drop(['c0'], axis=1)
        test_df_updated = test_df.drop(['c0'], axis=1)

        rf_max = 4.5

        # Create reference pipeline
        std_pipeline = Pipeline([
            RangeFilter(min=0.0, max=rf_max) << 'c2',
            OnlineGradientDescentRegressor(label='c2', feature=['c1'])
        ], random_state=seed)

        std_pipeline.fit(train_df_updated)
        result_1 = std_pipeline.predict(test_df_updated)

        # Create combined pipeline
        transform_pipeline = Pipeline([RangeFilter(min=0.0, max=rf_max) << 'c2'])
        transform_pipeline.fit(train_df_updated)

        combined_pipeline = Pipeline([
            DatasetTransformer(transform_model=transform_pipeline.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c1'])
        ], random_state=seed)
        combined_pipeline.fit(train_df_updated)

        os.remove(transform_pipeline.model)

        result_2 = combined_pipeline.predict(test_df_updated)

        self.assertTrue(result_1.equals(result_2))
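These test snippets reference module-level fixtures (train_df, test_df, seed) that are defined elsewhere in the nimbusml test suite. A minimal sketch of plausible setup, assuming a string column c0 and float32 columns c1 and c2 (the snippets depend only on the column names and dtypes, not on these exact values):

import os

import numpy as np
import pandas as pd

seed = 0  # hypothetical value; any fixed seed keeps runs reproducible

# Hypothetical fixtures mirroring the columns the tests reference.
train_df = pd.DataFrame({
    'c0': ['a', 'b', 'a', 'b'],
    'c1': np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32),
    'c2': np.array([2.0, 3.0, 4.0, 5.0], dtype=np.float32)
})
test_df = pd.DataFrame({
    'c0': ['a', 'b'],
    'c1': np.array([1.5, 2.5], dtype=np.float32),
    'c2': np.array([2.5, 3.5], dtype=np.float32)
})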
Example #2
    def test_combining_two_dataset_transformers(self):
        rf_max = 4.5

        # Create reference pipeline
        std_pipeline = Pipeline([
            RangeFilter(min=0.0, max=rf_max) << 'c2',
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)

        std_pipeline.fit(train_df)
        result_1 = std_pipeline.predict(test_df)

        # Create combined pipeline
        transform_pipeline1 = Pipeline([RangeFilter(min=0.0, max=rf_max) << 'c2'])
        transform_pipeline1.fit(train_df)

        transform_pipeline2 = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
        transform_pipeline2.fit(train_df)

        combined_pipeline = Pipeline([
            DatasetTransformer(transform_model=transform_pipeline1.model),
            DatasetTransformer(transform_model=transform_pipeline2.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)
        combined_pipeline.fit(train_df)

        os.remove(transform_pipeline1.model)
        os.remove(transform_pipeline2.model)

        result_2 = combined_pipeline.predict(test_df)

        self.assertTrue(result_1.equals(result_2))
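Note that both saved transform models are deleted before predict() runs: DatasetTransformer loads each transform into the combined pipeline at fit time, so the temporary model files on disk are no longer needed for scoring.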
Example #3
    def test_notvectorized_output_predictor_model(self):
        """
        This test verifies that outputted predictor model from 
        combined (with featurizers) pipeline runs successfully
        on featurized data with no vectors.
        """
        df = train_df.drop(['c0'], axis=1)

        # Create and fit a RangeFilter transform using the training
        # data and use it to transform the training data.
        transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                                      random_state=seed)
        transform_pipeline.fit(df)
        df1 = transform_pipeline.transform(df)

        # Create and fit a combined pipeline, requesting its predictor model
        combined_pipeline = Pipeline([
            RangeFilter(min=0.0, max=4.5) << 'c2',
            OnlineGradientDescentRegressor(label='c2')
        ], random_state=seed)
        combined_pipeline.fit(df, output_predictor_model=True)
        result_1 = combined_pipeline.predict(df)

        # Load predictor pipeline and score featurized data
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)
        result_2 = predictor_pipeline.predict(df1)

        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
Example #4
    def test_combine_transform_and_transform(self):
        transform_1 = RangeFilter(min=0.0, max=4.5) << 'c2'
        df = transform_1.fit_transform(train_df)

        transform_2 = OneHotVectorizer() << 'c0'
        transform_2.fit(df)

        df = transform_1.transform(test_df)
        result_1 = transform_2.transform(df)

        combined_pipeline = Pipeline.combine_models(transform_1,
                                                    transform_2,
                                                    contains_predictor=False)
        result_2 = combined_pipeline.transform(test_df)

        self.assertTrue(result_1.equals(result_2))
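Because the last model in the chain is a transform rather than a predictor, contains_predictor=False is passed to Pipeline.combine_models, and the combined pipeline is exercised through transform() instead of predict().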
Example #5
    def test_ensemble_supports_user_defined_transforms(self):
        test2_df = test_df.copy(deep=True)
        test2_df = test2_df.append(pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]}))

        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r1.fit(train_df)
        result1 = r1.predict(test2_df)

        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r2.fit(train_df)
        result2 = r2.predict(test2_df)

        r3 = LightGbmRegressor(**lgbmArgs)
        r3.fit(train_df)
        result3 = r3.predict(test2_df)

        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r3 = LightGbmRegressor(**lgbmArgs)

        pipeline = Pipeline([
            RangeFilter(min=0, max=10, columns='c1'),
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ])
        pipeline.fit(train_df)
        result4 = pipeline.predict(test2_df)

        self.assertEqual(len(result4), 3)

        average1 = (result1[0] + result2[0] + result3[0]) / 3
        average2 = (result1[1] + result2[1] + result3[1]) / 3
        average3 = (result1[2] + result2[2] + result3[2]) / 3
        self.assertAlmostEqual(average1, result4.loc[0, 'Score'], places=5)
        self.assertAlmostEqual(average2, result4.loc[1, 'Score'], places=5)
        self.assertAlmostEqual(average3, result4.loc[2, 'Score'], places=5)
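The argument dicts olsrArgs, ogdArgs, and lgbmArgs are fixtures defined elsewhere in the suite. A plausible minimal sketch, assuming each dict simply wires up the shared label and feature columns (the real fixtures may also carry tuning parameters):

# Hypothetical fixtures: each regressor predicts c2 from c1.
olsrArgs = dict(label='c2', feature=['c1'])
ogdArgs = dict(label='c2', feature=['c1'])
lgbmArgs = dict(label='c2', feature=['c1'])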
Example #6
    def test_combine_transform_and_pipeline(self):
        transform = RangeFilter(min=0.0, max=4.5) << 'c2'
        df = transform.fit_transform(train_df, as_binary_data_stream=True)

        pipeline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ])
        pipeline.fit(df)

        df = transform.transform(test_df, as_binary_data_stream=True)
        result_1 = pipeline.predict(df)

        combined_pipeline = Pipeline.combine_models(transform, pipeline)
        result_2 = combined_pipeline.predict(test_df)

        self.assertTrue(result_1.equals(result_2))
Example #7
    def test_get_fit_info(self):
        transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'])
        transform_pipeline.fit(train_df)

        combined_pipeline = Pipeline([
            DatasetTransformer(transform_model=transform_pipeline.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c1'])
        ], random_state=seed)
        combined_pipeline.fit(train_df)

        info = combined_pipeline.get_fit_info(train_df)

        self.assertTrue(info[0][1]['name'] == 'DatasetTransformer')
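The same structure can be walked to inspect every node; a sketch, assuming info[0] is the list of per-step dictionaries implied by the assertion above:

# Print the name of each node in the fitted pipeline.
for step in info[0]:
    print(step.get('name'))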
Example #8
    def test_ensemble_supports_output_predictor_model(self):
        test2_df = test_df.copy(deep=True)
        test2_df = test2_df.append(pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]}),
                                   ignore_index=True)
        test2_df = test2_df.astype({'c1': np.float32, 'c2': np.float32})

        # Create a ground truth pipeline
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c1',
                                      VotingRegressor(estimators=[r1, r2], combiner='Average')])
        combined_pipeline.fit(train_df)
        result_1 = combined_pipeline.predict(test2_df)

        # Create a duplicate pipeline but also request a predictor model
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c1',
                                      VotingRegressor(estimators=[r1, r2], combiner='Average')])
        combined_pipeline.fit(train_df, output_predictor_model=True)
        result_2 = combined_pipeline.predict(test2_df)

        # Create a predictor model only pipeline
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)
        result_3 = predictor_pipeline.predict(test2_df)

        # Verify the first rows are equal
        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_2.loc[0, 'Score'], result_3.loc[0, 'Score'])

        # Verify the second rows are equal
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
        self.assertEqual(result_2.loc[1, 'Score'], result_3.loc[1, 'Score'])

        # Verify the number of rows
        self.assertEqual(len(result_1), 2)
        self.assertEqual(len(result_2), 2)
        self.assertEqual(len(result_3), 4)
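The row-count assertions capture the key difference: test2_df has four rows, and the full pipelines apply the RangeFilter on c1, dropping the two appended rows whose c1 values (9 and 11) fall outside [0.0, 4.5], while the predictor-only model contains no filter and scores all four rows.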
Example #9
    def test_get_schema_returns_correct_value_for_single_valued_columns(self):
        df = train_df.drop(['c0'], axis=1)

        pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'])
        pipeline.fit(df)
        df = pipeline.transform(df)

        schema = pipeline.get_output_columns()

        self.assertTrue('c1' in schema)
        self.assertTrue('c2' in schema)

        self.assertEqual(len(schema), 2)
Example #10
    def test_three_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(
            self):
        """
        This test verifies that three models can be combined
        even if the transform increases the number of columns.
        """
        # Create and fit a RangeFilter transform using the training
        # data and use it to transform the training data.
        transform_pipeline_1 = Pipeline(
            [RangeFilter(min=0.0, max=4.5) << 'c2'])
        df = transform_pipeline_1.fit_transform(train_df,
                                                as_binary_data_stream=True)

        # Create and fit a OneHotVectorizer transform using
        # the transformed data from the previous step and use it
        # to transform the data from the previous step.
        transform_pipeline_2 = Pipeline([OneHotVectorizer() << 'c0'],
                                        random_state=seed)
        transform_pipeline_2.fit(df)
        df = transform_pipeline_2.transform(df, as_binary_data_stream=True)

        # Create and fit an OnlineGradientDescentRegressor using
        # the transformed training data from the previous step.
        predictor_pipeline = Pipeline(
            [OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])],
            random_state=seed)
        predictor_pipeline.fit(df)

        # Perform a prediction given the test data using
        # the transforms and predictor defined previously.
        df = transform_pipeline_1.transform(test_df,
                                            as_binary_data_stream=True)
        df = transform_pipeline_2.transform(df, as_binary_data_stream=True)
        result_1 = predictor_pipeline.predict(df)

        # Combine the above Pipelines in to one Pipeline and use
        # the new Pipeline to get predictions given the test data.
        combined_pipeline = Pipeline.combine_models(transform_pipeline_1,
                                                    transform_pipeline_2,
                                                    predictor_pipeline)
        result_2 = combined_pipeline.predict(test_df)

        # Verify that the prediction from the combined Pipeline
        # matches the prediction from the original two Pipelines.
        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
Example #11
    def test_range_filter_short_example(self):
        d = pd.DataFrame([[1., 1.9, 3.], [2., 3., 4.], [2., 3., 4.]])
        d.columns = ['aa', 'bb', 'cc']

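        # The (1, 3) shape assertions below imply that rows with values equal
        # to max=2 are dropped, so each filter keeps one of the three rows.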
        hdl = RangeFilter(min=0, max=2) << 'aa'
        res1 = hdl.fit_transform(d)
        assert res1 is not None
        assert res1.shape == (1, 3)

        hdl = RangeFilter(min=0, max=2) << 'bb'
        res2 = hdl.fit_transform(d)
        assert res2 is not None
        assert res2.shape == (1, 3)
        assert res1.values.ravel().tolist() == res2.values.ravel().tolist()
Example #12
    def test_two_pipelines_created_using_dataframes_can_be_combined_when_the_schemas_are_the_same(
            self):
        """
        This test verifies that two models created using DataFrames
        can be combined if the output schema of the first is the same
        as the input schema of the second.
        """
        df = train_df.drop(['c0'], axis=1)

        # Create and fit a RangeFilter transform using the training
        # data and use it to transform the training data.
        transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                                      random_state=seed)
        transform_pipeline.fit(df)
        df = transform_pipeline.transform(df)

        # Create and fit an OnlineGradientDescentRegressor using
        # the transformed training data from the previous step.
        predictor_pipeline = Pipeline(
            [OnlineGradientDescentRegressor(label='c2')], random_state=seed)
        predictor_pipeline.fit(df)

        # Perform a prediction given the test data using
        # the transform and predictor defined previously.
        df = transform_pipeline.transform(test_df)
        result_1 = predictor_pipeline.predict(df)

        df = test_df.drop(['c0'], axis=1)

        # Combine the above Pipelines in to one Pipeline and use
        # the new Pipeline to get predictions given the test data.
        combined_pipeline = Pipeline.combine_models(transform_pipeline,
                                                    predictor_pipeline)
        result_2 = combined_pipeline.predict(df)

        # Verify that the prediction from the combined Pipeline
        # matches the prediction from the original two Pipelines.
        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
Example #13
###############################################################################
# RangeFilter
import numpy as np
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.filter import RangeFilter

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, numeric_dtype=np.float32)
print(data.head())
#    age  case education  induced  parity  pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs      1.0     6.0             3.0      1.0  ...
# 1  42.0   1.0    0-5yrs      1.0     1.0             1.0      2.0  ...
# 2  39.0   1.0    0-5yrs      2.0     6.0             4.0      3.0  ...
# 3  34.0   1.0    0-5yrs      2.0     4.0             2.0      4.0  ...
# 4  35.0   1.0   6-11yrs      1.0     3.0            32.0      5.0  ...

# transform usage
xf = RangeFilter(min=20, max=30, columns='age')

# fit and transform, rows with age outside the range will be deleted
features = xf.fit_transform(data)
print(features.head())
#    age  case education  id  induced  parity  pooled.stratum  ...
# 0  26.0     1    0-5yrs   1        1       6               3 ...
# 1  23.0     1   6-11yrs   7        0       1               6 ...
# 2  21.0     1   6-11yrs   9        0       1               5 ...
# 3  28.0     1   6-11yrs  10        0       2              19 ...
# 4  29.0     1   6-11yrs  11        1       2              20 ...
Example #14
    'Resizer': Pipeline([
        Loader(columns={'ImgPath': 'Path'}),
        Resizer(image_width=227, image_height=227,
                columns={'ImgResize': 'ImgPath'})
    ]),
    'SkipFilter': SkipFilter(count=5),
    'SsaSpikeDetector': SsaSpikeDetector(columns=['Sepal_Length'],
                                         seasonal_window_size=2),
    'SsaChangePointDetector': SsaChangePointDetector(columns=['Sepal_Length'],
                                                     seasonal_window_size=2),
    'SsaForecaster': SsaForecaster(columns=['Sepal_Length'],
                                   window_size=2,
                                   series_length=5,
                                   train_size=5,
                                   horizon=1),
    'RangeFilter': RangeFilter(min=5.0, max=5.1, columns=['Sepal_Length']),
    'TakeFilter': TakeFilter(count=100),
    'TensorFlowScorer': TensorFlowScorer(
        model_location=os.path.join(
            script_dir,
            '..',
            'nimbusml',
            'examples',
            'frozen_saved_model.pb'),
        columns={'c': ['a', 'b']}),
    'ToKey': ToKey(columns={'edu_1': 'education_str'}),
    'TypeConverter': TypeConverter(columns=['group'], result_type='R4'),
    'WordTokenizer': WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'}
}

DATASETS = {
Example #15
###############################################################################
# RangeFilter
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.filter import RangeFilter
from sklearn.model_selection import train_test_split

# use 'iris' data set to create test and train data
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
np.random.seed(0)
df = get_dataset("iris").as_df()

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

# select rows where 5.0 <= Sepal_Length <= 5.1
filter = RangeFilter(min=5.0, max=5.1) << 'Sepal_Length'
print(filter.fit_transform(X_train))

# select rows where Sepal_Length <= 4.5 or Sepal_Length >= 7.5
filter = RangeFilter(min=4.5, max=7.5, complement=True) << 'Sepal_Length'
print(filter.fit_transform(X_train))