def test_same_schema_with_dataframe_input(self):
        """
        Compare an inline RangeFilter pipeline with one that loads the
        fitted RangeFilter through a DatasetTransformer.

        Both pipelines are trained and scored on the same DataFrames
        (with column 'c0' dropped) and must produce equal predictions.
        """
        upper_bound = 4.5
        train_data = train_df.drop(['c0'], axis=1)
        test_data = test_df.drop(['c0'], axis=1)

        # Reference: the filter and the regressor are trained in one pipeline.
        reference = Pipeline([
            RangeFilter(min=0.0, max=upper_bound) << 'c2',
            OnlineGradientDescentRegressor(label='c2', feature=['c1'])
        ], random_state=seed)
        reference.fit(train_data)
        expected = reference.predict(test_data)

        # Fit the filter on its own so that its model can be plugged
        # into a second pipeline via DatasetTransformer.
        filter_only = Pipeline([RangeFilter(min=0.0, max=upper_bound) << 'c2'])
        filter_only.fit(train_data)

        combined = Pipeline([
            DatasetTransformer(transform_model=filter_only.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c1'])
        ], random_state=seed)
        combined.fit(train_data)

        # The transform's model file is deleted before predicting
        # (presumably to show the combined pipeline no longer needs it).
        os.remove(filter_only.model)

        actual = combined.predict(test_data)

        self.assertTrue(expected.equals(actual))
    def test_different_schema_with_dataframe_input(self):
        """
        Compare an inline OneHotVectorizer pipeline with one that loads
        the fitted OneHotVectorizer through a DatasetTransformer.

        Both pipelines must produce equal predictions on test_df.
        """
        # Reference: vectorizer and regressor are trained together.
        reference = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)
        reference.fit(train_df)
        expected = reference.predict(test_df)

        # Fit the vectorizer on its own so its model can be reused.
        vectorizer_only = Pipeline([OneHotVectorizer() << 'c0'],
                                   random_state=seed)
        vectorizer_only.fit(train_df)

        combined = Pipeline([
            DatasetTransformer(transform_model=vectorizer_only.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)
        combined.fit(train_df)

        # The transform's model file is deleted before predicting
        # (presumably to show the combined pipeline no longer needs it).
        os.remove(vectorizer_only.model)

        actual = combined.predict(test_df)

        self.assertTrue(expected.equals(actual))
    def test_combining_two_dataset_transformers(self):
        """
        Chain two DatasetTransformers (a fitted RangeFilter and a fitted
        OneHotVectorizer) in front of a regressor and check that the
        predictions equal those of a pipeline with both transforms inline.
        """
        upper_bound = 4.5

        # Reference: both transforms and the regressor in one pipeline.
        reference = Pipeline([
            RangeFilter(min=0.0, max=upper_bound) << 'c2',
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)
        reference.fit(train_df)
        expected = reference.predict(test_df)

        # Fit each transform separately so their models can be reused.
        filter_only = Pipeline([RangeFilter(min=0.0, max=upper_bound) << 'c2'])
        filter_only.fit(train_df)

        vectorizer_only = Pipeline([OneHotVectorizer() << 'c0'],
                                   random_state=seed)
        vectorizer_only.fit(train_df)

        combined = Pipeline([
            DatasetTransformer(transform_model=filter_only.model),
            DatasetTransformer(transform_model=vectorizer_only.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)
        combined.fit(train_df)

        # Both transform model files are deleted before predicting.
        os.remove(filter_only.model)
        os.remove(vectorizer_only.model)

        actual = combined.predict(test_df)

        self.assertTrue(expected.equals(actual))
    def test_notvectorized_output_predictor_model(self):
        """
        Verify that the predictor model emitted by a combined
        (featurizer + learner) pipeline scores data that was
        featurized separately and contains no vector columns.
        """
        data = train_df.drop(['c0'], axis=1)

        # Fit a stand-alone RangeFilter on the training data and apply
        # it to the same data to obtain the featurized frame.
        featurizer = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                              random_state=seed)
        featurizer.fit(data)
        featurized = featurizer.transform(data)

        # Fit the combined pipeline and ask it to emit a predictor model.
        combined_pipeline = Pipeline(
            [
                RangeFilter(min=0.0, max=4.5) << 'c2',
                OnlineGradientDescentRegressor(label='c2'),
            ],
            random_state=seed)
        combined_pipeline.fit(data, output_predictor_model=True)
        combined_scores = combined_pipeline.predict(data)

        # Load only the predictor model and score the featurized data.
        predictor_only = Pipeline()
        predictor_only.load_model(combined_pipeline.predictor_model)
        predictor_scores = predictor_only.predict(featurized)

        # The first two rows must score identically.
        for row in (0, 1):
            self.assertEqual(combined_scores.loc[row, 'Score'],
                             predictor_scores.loc[row, 'Score'])
示例#5
0
    def test_predict(self):
        """
        Train a FastLinearRegressor on the first object returned by
        transform_data() and check that predicting on either returned
        object gives equal scores.
        """
        # transform_data() returns two representations of the data
        # (the second is presumably a DataFrame, going by its name).
        transformed, transformed_df = transform_data()

        regressor = FastLinearRegressor(
            label='age',
            feature=['parity', 'in', 'sp', 'stratum'])
        pipeline = Pipeline([regressor])
        pipeline.fit(transformed)

        scores_from_first = pipeline.predict(transformed)
        scores_from_second = pipeline.predict(transformed_df)

        assert_array_equal(scores_from_first, scores_from_second)
示例#6
0
    def test_model_datastream(self):
        """
        Check that a fitted pipeline gives the same scores and metrics
        after a pickle round trip and after a save_model/load_model
        round trip.

        Temporary files are removed in ``finally`` blocks so they do not
        leak when a step or an assertion fails.
        """
        model_nimbusml = Pipeline(
            steps=[
                ('cat',
                 OneHotVectorizer() << categorical_columns),
                ('linear',
                 FastLinearBinaryClassifier(
                     shuffle=False,
                     number_of_threads=1))])

        model_nimbusml.fit(train, label)

        # Save with pickle
        pickle_filename = get_temp_file(suffix='.p')
        try:
            with open(pickle_filename, 'wb') as f:
                pickle.dump(model_nimbusml, f)

            with open(pickle_filename, "rb") as f:
                model_nimbusml_pickle = pickle.load(f)
        finally:
            if os.path.exists(pickle_filename):
                os.remove(pickle_filename)

        score1 = model_nimbusml.predict(test).head(5)
        score2 = model_nimbusml_pickle.predict(test).head(5)

        metrics, _ = model_nimbusml.test(test, test_label, output_scores=True)
        metrics_pickle, _ = model_nimbusml_pickle.test(
            test, test_label, output_scores=True)

        assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
        assert_almost_equal(
            metrics.sum().sum(),
            metrics_pickle.sum().sum(),
            decimal=2)

        # Save load with pipeline methods
        model_filename = get_temp_file(suffix='.m')
        try:
            model_nimbusml.save_model(model_filename)
            model_nimbusml_load = Pipeline()
            model_nimbusml_load.load_model(model_filename)

            score1 = model_nimbusml.predict(test).head(5)
            score2 = model_nimbusml_load.predict(test).head(5)

            metrics, _ = model_nimbusml.test(
                test, test_label, output_scores=True)
            # Keep the metrics in their own name instead of overwriting
            # the loaded pipeline object with them.
            metrics_load, _ = model_nimbusml_load.test(
                test, test_label, evaltype='binary', output_scores=True)

            assert_almost_equal(
                score1.sum().sum(), score2.sum().sum(), decimal=2)
            assert_almost_equal(
                metrics.sum().sum(),
                metrics_load.sum().sum(),
                decimal=2)
        finally:
            if os.path.exists(model_filename):
                os.remove(model_filename)
示例#7
0
    def test_syntax_concat_slots(self):
        """
        Smoke test: one-hot encode 'workclass' and 'education', concatenate
        them into 'newcol', then fit and call predict on the same frame.
        The test only requires that no exception is raised.
        """
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [10., 1., 1., 1., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        vectorizer = OneHotVectorizer() << ['workclass', 'education']
        concat = Concat() << {'newcol': ['workclass', 'education']}
        pipeline = Pipeline([vectorizer, concat])

        pipeline.fit(data, verbose=0)
        pipeline.predict(data)
    def test_pipeline_subclass_can_override_predict(self):
        """
        Check that a Pipeline subclass (CustomPipeline) can replace
        predict(): the stock Pipeline returns labels equal to y, while
        the subclass returns the value passed as ``test_return_value``.
        """
        X, y = generate_dataset_1()

        # Baseline behaviour of the stock Pipeline.
        baseline = Pipeline([LogisticRegressionBinaryClassifier()])
        baseline.fit(X, y)
        predicted_labels = baseline.predict(X)['PredictedLabel']

        labels_match = np.array_equal(predicted_labels.values, y['y'].values)
        self.assertTrue(labels_match)

        # The subclass is expected to hand back test_return_value.
        custom = CustomPipeline([LogisticRegressionBinaryClassifier()])
        custom.fit(X, y)
        returned = custom.predict(X, test_return_value=3)

        self.assertEqual(returned, 3)
示例#9
0
    def test_syntax6_change_role(self):
        """
        Build a 'Features' column from three encoded columns, drop every
        other column, and train a binary classifier explicitly on
        'Features'.
        """
        # REVIEW: the pipeline drops all columns but one, yet nimbusml
        # still thinks the features are education and workclass. It does
        # not automatically detect that the only remaining column should
        # play that role (maybe because the label column is present too,
        # even though the only remaining column without a role is
        # Features).
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        slots = ['f%d' % i for i in range(1, 4)]
        steps = [
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': slots},
            Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
            FastLinearBinaryClassifier(
                maximum_number_of_iterations=1) << ['Features'],
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(X, y, verbose=0)

        prediction = pipeline.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(prediction.columns)) == [
            'PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
示例#10
0
    def test_syntax6_regular_expression(self):
        """
        Build a 'Features' column from three encoded columns, apply
        ``Drop() << '~Features'`` and train a binary classifier without
        naming its input column.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        slots = ['f%d' % i for i in range(1, 4)]
        pipeline = Pipeline([
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': slots},
            Drop() << '~Features',
            FastLinearBinaryClassifier(max_iterations=1),
        ])
        pipeline.fit(X, y)

        prediction = pipeline.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert sorted(list(prediction.columns)) == expected_columns
        assert prediction.shape == (5, 3)
示例#11
0
    def test_ensemble_supports_user_defined_transforms(self):
        """
        Check that a VotingRegressor can be preceded by a user-defined
        transform (a RangeFilter on 'c1') in the same pipeline.

        Three regressors are first fitted individually, then fresh
        instances are combined with the 'Average' combiner. The ensemble
        must return three rows whose scores equal the mean of the
        individual predictions for rows 0..2.
        """
        # Extend the test set with two extra rows. pd.concat is used
        # because DataFrame.append was removed in pandas 2.0.
        extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
        test2_df = pd.concat([test_df.copy(deep=True), extra_rows])

        # Individual predictions used to compute the expected averages.
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r1.fit(train_df)
        result1 = r1.predict(test2_df)

        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r2.fit(train_df)
        result2 = r2.predict(test2_df)

        r3 = LightGbmRegressor(**lgbmArgs)
        r3.fit(train_df)
        result3 = r3.predict(test2_df)

        # Fresh, unfitted estimators for the ensemble.
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r3 = LightGbmRegressor(**lgbmArgs)

        pipeline = Pipeline([
            RangeFilter(min=0, max=10, columns='c1'),
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ])
        pipeline.fit(train_df)
        result4 = pipeline.predict(test2_df)

        self.assertEqual(len(result4), 3)

        average1 = (result1[0] + result2[0] + result3[0]) / 3
        average2 = (result1[1] + result2[1] + result3[1]) / 3
        average3 = (result1[2] + result2[2] + result3[2]) / 3
        self.assertAlmostEqual(average1, result4.loc[0, 'Score'], places=5)
        self.assertAlmostEqual(average2, result4.loc[1, 'Score'], places=5)
        self.assertAlmostEqual(average3, result4.loc[2, 'Score'], places=5)
示例#12
0
    def test_syntax8_label(self):
        """
        Train a regressor whose label ('new_y') is produced inside the
        pipeline by scaling 'yy', then check the learner's resolved
        column roles and the range of the predicted scores.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
        })
        X = frame.drop('yy', axis=1)

        learner_roles = {'Feature': ['workclass', 'education'],
                         Role.Label: 'new_y'}
        pipeline = Pipeline([
            MeanVarianceScaler() << {'new_y': 'yy'},
            OneHotVectorizer() << ['workclass', 'education'],
            Drop() << 'yy',
            FastLinearRegressor() << learner_roles,
        ])
        pipeline.fit(frame, verbose=0)

        learner = pipeline.nodes[-1]
        assert learner.feature_column_ == 'Features'
        assert learner.label_column_ == 'new_y'

        # The pipeline requires 'yy' at prediction time as well, because
        # the column is transformed along the way; a dummy value is used.
        X['yy'] = 0.0
        prediction = pipeline.predict(X, verbose=0)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)

        scores = prediction['Score']
        if scores.min() < 0.4:
            raise Exception(prediction)
        if scores.max() > 2.00:
            raise Exception(prediction)
示例#13
0
    def test_syntax12_mixed2(self):
        """
        Mix constructor arguments (``columns=``, ``feature=``, ``weight=``)
        with the ``<<`` role syntax and check the roles resolved on the
        final learner as well as the shape of the predictions.
        """
        X = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [10., 1., 1., 1., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        vectorizer = OneHotVectorizer(columns=['workclass', 'education'])
        concat = Concat(columns={'Feature': ['workclass', 'education']})
        regressor = FastTreesRegressor(
            num_trees=5, feature='Feature', weight='weight'
        ) << {Role.Label: 'y'}

        pipeline = Pipeline([vectorizer, concat, regressor])
        pipeline.fit(X, verbose=0)

        learner = pipeline.nodes[-1]
        assert learner.feature_column_ == 'Feature'
        assert learner.label_column_ == 'y'
        assert learner.weight_column_ == 'weight'

        # y and weight are required at prediction time too, so they are
        # overwritten with fake values. The test does not fail, but the
        # weight is not taken into account.
        X['y'] = -5
        X['weight'] = -5
        prediction = pipeline.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)
示例#14
0
def nimbus_pred(model_path, test_set_path):
    """Load a saved model, score a CSV test set and print the predictions.

    Args:
        model_path: Path passed to ``Pipeline.load_model``.
        test_set_path: Path of the CSV file read with ``pd.read_csv``;
            its column 'c' is converted to the pandas ``category`` dtype
            before scoring.
    """
    features = pd.read_csv(test_set_path)
    features['c'] = features['c'].astype("category")

    pipeline = Pipeline()
    pipeline.load_model(model_path)

    predictions = pipeline.predict(features)
    print(predictions)
示例#15
0
    def test_syntax10_weights(self):
        """
        Fit a regressor with per-row weights passed through ``fit`` and
        check the learner's resolved column roles plus the range and
        variety of the predicted scores.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [1., 1., 1., 2., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })
        X = frame.drop(['y', 'weight'], axis=1)
        y = frame['y']
        w = frame['weight']

        pipeline = Pipeline([
            OneHotVectorizer() << ['workclass', 'education'],
            FastLinearRegressor(),
        ])
        pipeline.fit(X, y, weight=w, verbose=0)

        learner = pipeline.nodes[-1]
        assert learner.feature_column == 'Features'
        assert learner.label_column == 'y'
        assert learner.weight_column == 'weight'

        # A placeholder weight column is supplied for prediction.
        X['weight'] = -5
        prediction = pipeline.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)

        scores = prediction['Score']
        if scores.min() < 1.:
            raise Exception(prediction)
        if scores.max() > 3.6:
            raise Exception(prediction)
        # At least four distinct score values are expected.
        if len(set(scores)) < 4:
            raise Exception(prediction)
示例#16
0
    def test_syntax3(self):
        """
        Mix the dict form and the plain-string form of ``<<`` on the
        transforms and train a binary classifier without naming its
        inputs; check the prediction frame's columns and shape.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        pipeline = Pipeline([
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << 'education',
            OneHotVectorizer(max_num_terms=2) << 'workclass',
            # Currently the learner does not use edu1 unless it is
            # specified explicitly, so nimbusml does not do what the
            # syntax implicitly suggests. Either the bridge needs to be
            # changed to look at every available column at each step.
            FastLinearBinaryClassifier(max_iterations=1),
        ])
        pipeline.fit(X, y)

        prediction = pipeline.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert sorted(list(prediction.columns)) == expected_columns
        assert prediction.shape == (5, 3)
示例#17
0
    def test_syntax11_learner(self):
        """
        Declare both the feature columns and the label on the learner
        via ``<<``, fit on the full frame (label included) and predict
        on the frame without the label.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)

        learner_roles = {'Features': ['edu1', 'edu2'], Role.Label: 'y'}
        pipeline = Pipeline([
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << {'edu2': 'education'},
            FastLinearBinaryClassifier(max_iterations=1) << learner_roles,
        ])
        pipeline.fit(frame)

        prediction = pipeline.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert sorted(list(prediction.columns)) == expected_columns
        assert prediction.shape == (5, 3)
示例#18
0
    def test_syntax4_dict(self):
        """
        Use the dict form of ``<<`` for every transform, concatenate the
        outputs into 'Inputs' and train the classifier on that column.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        pipeline = Pipeline([
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << {'edu2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
            Concat() << {'Inputs': ['edu1', 'edu2', 'wki']},
            FastLinearBinaryClassifier(max_iterations=1) << 'Inputs',
        ])
        pipeline.fit(X, y)

        prediction = pipeline.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert sorted(list(prediction.columns)) == expected_columns
        assert prediction.shape == (5, 3)
示例#19
0
    def test_syntax5_regular_expression(self):
        """
        Pass the pattern 'f[0-9]+' as the Concat source to build
        'Features', then train the classifier on that column.
        """
        # REVIEW: not implemented yet.
        # The best option would be to handle regular expressions inside
        # nimbusml. It could be done in entrypoint.py just before calling
        # nimbusml, or inside Pipeline if it is aware of the input schema.
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        pipeline = Pipeline([
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': 'f[0-9]+'},
            FastLinearBinaryClassifier(max_iterations=1) << 'Features',
        ])
        pipeline.fit(X, y)

        prediction = pipeline.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert sorted(list(prediction.columns)) == expected_columns
        assert prediction.shape == (5, 3)
示例#20
0
    def test_syntax7_rename(self):
        """
        Encode a string label, convert it to R4 under the new name 'yi',
        drop the original 'y' column and train a binary classifier on
        'yi'; then check the shape and score range of the predictions.
        """
        # Error messages are usually not informative enough.
        # Missing column --> no indication of other columns.
        # The error is (one transform should handle it):
        # "The label column 'y' of the training data has a data type
        # not suitable for binary classification: Vec<Key<U4, 0-1>, 2>.
        # Type must be Bool, R4, R8 or Key with two classes."

        df = pandas.DataFrame(
            dict(
                education=[
                    'A', 'B', 'A', 'B', 'A'], workclass=[
                    'X', 'X', 'Y', 'Y', 'Y'], y=[
                    'red', 'white', 'red', 'white', 'white']))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << 'y',
            OneHotVectorizer() << ['workclass', 'education'],
            TypeConverter(result_type='R4') << {'yi': 'y'},
            Drop() << 'y',
            FastLinearBinaryClassifier(max_iterations=1) << 'yi'
        ])
        exp.fit(X, y, verbose=0)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)

        # Compare scalar values: DataFrame.min()/max() return a Series,
        # and asserting on a Series raises "truth value is ambiguous".
        scores = prediction['Score']
        assert scores.min() > 0.01
        assert scores.max() < 0.05
示例#21
0
    def test_two_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(
            self):
        """
        Verify that a transform pipeline and a predictor pipeline can be
        merged with Pipeline.combine_models even when the transform
        increases the number of columns and the intermediate data is
        passed as a binary data stream.
        """
        # Fit a OneHotVectorizer on the training data and use it to
        # produce the transformed training data as a binary data stream.
        vectorizer = Pipeline([OneHotVectorizer() << 'c0'],
                              random_state=seed)
        vectorizer.fit(train_df)
        train_stream = vectorizer.transform(
            train_df, as_binary_data_stream=True)

        # Fit the regressor on the transformed training data.
        regressor = Pipeline(
            [OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])],
            random_state=seed)
        regressor.fit(train_stream)

        # Two-step prediction: transform the test data, then score it.
        test_stream = vectorizer.transform(
            test_df, as_binary_data_stream=True)
        two_step_scores = regressor.predict(test_stream)

        # One-step prediction with the two pipelines combined.
        combined = Pipeline.combine_models(vectorizer, regressor)
        combined_scores = combined.predict(test_df)

        # The first two rows must score identically either way.
        for row in (0, 1):
            self.assertEqual(two_step_scores.loc[row, 'Score'],
                             combined_scores.loc[row, 'Score'])
示例#22
0
 def test_trees(self):
     """
     Train OneHotVectorizer + FastTreesBinaryClassifier on in-memory
     data loaded with get_X_y and pass the predictions to
     check_accuracy with 0.65 (presumably a minimum accuracy).
     """
     train_features, train_labels = get_X_y(train_file, label_column, sep=',')
     test_features, _ = get_X_y(test_file, label_column, sep=',')

     steps = [
         OneHotVectorizer() << categorical_columns,
         FastTreesBinaryClassifier(),
     ]
     model = Pipeline(steps)
     model.fit(train_features, train_labels)

     predictions = model.predict(test_features)
     check_accuracy(test_file, label_column, predictions, 0.65)
示例#23
0
    def test_model_datastream(self):
        """
        Check that a fitted pipeline gives the same scores and metrics
        after a pickle round trip and after a save_model/load_model
        round trip.

        File handles are closed with ``with`` blocks, and the two files
        written to the working directory are removed when the test ends,
        whether it passes or fails.
        """
        import os  # local import: only needed for the cleanup below

        pickle_path = 'nimbusml_model.p'
        model_path = 'model.nimbusml.m'

        model_nimbusml = Pipeline(
            steps=[('cat', OneHotVectorizer() << categorical_columns),
                   ('linear',
                    FastLinearBinaryClassifier(shuffle=False, train_threads=1)
                    )])

        model_nimbusml.fit(train, label)

        try:
            # Save with pickle
            with open(pickle_path, 'wb') as f:
                pickle.dump(model_nimbusml, f)
            with open(pickle_path, 'rb') as f:
                model_nimbusml_pickle = pickle.load(f)

            score1 = model_nimbusml.predict(test).head(5)
            score2 = model_nimbusml_pickle.predict(test).head(5)

            metrics, _ = model_nimbusml.test(test,
                                             test_label,
                                             output_scores=True)
            metrics_pickle, _ = model_nimbusml_pickle.test(
                test, test_label, output_scores=True)

            assert_almost_equal(
                score1.sum().sum(), score2.sum().sum(), decimal=2)
            assert_almost_equal(metrics.sum().sum(),
                                metrics_pickle.sum().sum(),
                                decimal=2)

            # Save load with pipeline methods
            model_nimbusml.save_model(model_path)
            model_nimbusml_load = Pipeline()
            model_nimbusml_load.load_model(model_path)

            score1 = model_nimbusml.predict(test).head(5)
            score2 = model_nimbusml_load.predict(test).head(5)

            metrics, _ = model_nimbusml.test(test,
                                             test_label,
                                             output_scores=True)
            # Keep the metrics in their own name instead of overwriting
            # the loaded pipeline object with them.
            metrics_load, _ = model_nimbusml_load.test(
                test, test_label, evaltype='binary', output_scores=True)

            assert_almost_equal(
                score1.sum().sum(), score2.sum().sum(), decimal=2)
            assert_almost_equal(metrics.sum().sum(),
                                metrics_load.sum().sum(),
                                decimal=2)
        finally:
            # Do not leave model files behind in the working directory.
            for path in (pickle_path, model_path):
                if os.path.exists(path):
                    os.remove(path)
示例#24
0
 def test_trees_file(self):
     """
     Train OneHotVectorizer + FastTreesBinaryClassifier from a
     FileDataStream, with the label declared on the learner and also
     passed to fit, then pass the predictions to check_accuracy with
     0.65 (presumably a minimum accuracy).
     """
     classifier = FastTreesBinaryClassifier() << {'Label': label_column}
     model = Pipeline([
         OneHotVectorizer() << categorical_columns,
         classifier,
     ])

     training_stream = FileDataStream(train_file, schema=file_schema)
     model.fit(training_stream, label_column)

     scoring_stream = FileDataStream(test_file, schema=file_schema)
     predictions = model.predict(scoring_stream)
     check_accuracy(test_file, label_column, predictions, 0.65)
示例#25
0
 def test_linear_with_train_test_schema(self):
     """
     Train OneHotVectorizer + FastLinearBinaryClassifier on in-memory
     data loaded with get_X_y and pass the predictions to
     check_accuracy with 0.65 (presumably a minimum accuracy).
     """
     train_features, train_labels = get_X_y(train_file, label_column, sep=',')
     test_features, _ = get_X_y(test_file, label_column, sep=',')

     classifier = FastLinearBinaryClassifier(train_threads=1, shuffle=False)
     model = Pipeline([
         OneHotVectorizer() << categorical_columns,
         classifier,
     ])
     model.fit(train_features, train_labels)

     predictions = model.predict(test_features)
     check_accuracy(test_file, label_column, predictions, 0.65)
示例#26
0
 def test_linear_file_role(self):
     """
     Train OneHotVectorizer + FastLinearBinaryClassifier from a
     FileDataStream whose label is set with ``_set_role`` instead of
     being passed to fit, then pass the predictions to check_accuracy
     with 0.65 (presumably a minimum accuracy).
     """
     classifier = FastLinearBinaryClassifier(train_threads=1, shuffle=False)
     model = Pipeline([
         OneHotVectorizer() << categorical_columns,
         classifier,
     ])

     training_stream = FileDataStream(train_file, schema=file_schema)
     # The label role is attached to the stream itself.
     training_stream._set_role('Label', label_column)
     model.fit(training_stream)

     scoring_stream = FileDataStream(test_file, schema=file_schema)
     predictions = model.predict(scoring_stream)
     check_accuracy(test_file, label_column, predictions, 0.65)
示例#27
0
    def test_linear_file(self):
        """
        Train OneHotVectorizer + FastLinearBinaryClassifier from a
        FileDataStream, checking first that the stream's schema options
        contain 'sep' and 'header', then pass the predictions to
        check_accuracy with 0.65 (presumably a minimum accuracy).
        """
        classifier = FastLinearBinaryClassifier(train_threads=1,
                                                shuffle=False)
        model = Pipeline([
            OneHotVectorizer() << categorical_columns,
            classifier,
        ])

        training_stream = FileDataStream(train_file, schema=file_schema)
        schema_options = training_stream.schema.options
        assert 'sep' in schema_options
        assert 'header' in schema_options
        model.fit(training_stream, label_column)

        scoring_stream = FileDataStream(test_file, schema=file_schema)
        predictions = model.predict(scoring_stream)
        check_accuracy(test_file, label_column, predictions, 0.65)
示例#28
0
    def test_syntax11_append_insert(self):
        """
        Exercise the list-like editing API of Pipeline: append, insert,
        ``del``, ``len`` and indexing by position or by step name,
        both before and after the pipeline is fitted.
        """

        df = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                                   workclass=['X', 'X', 'Y', 'Y', 'Y'],
                                   y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)

        # Start from an empty pipeline and build it step by step.
        exp = Pipeline()
        # Append a named step given as a (name, transform) tuple.
        exp.append(
            ("OneHotHashVectorizer",
             OneHotHashVectorizer() << {
                 'edu2': 'education'}))
        # Insert an unnamed step at the front.
        exp.insert(0, OneHotVectorizer() << {'edu1': 'education'})
        exp.append(
            FastLinearBinaryClassifier(
                maximum_number_of_iterations=1) << {
                'Features': [
                    'edu1',
                    'edu2'],
                Role.Label: 'y'})
        # Append one more step and remove it again right away, so that
        # three steps remain.
        exp.append(OneHotHashVectorizer() << {'edu2': 'education'})
        del exp[-1]
        assert len(exp) == 3

        exp.fit(df, verbose=0)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(prediction.columns)) == [
            'PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)

        # After fitting, editing the pipeline is expected to fail.
        # NOTE(review): each block below only checks the message when a
        # RuntimeError is raised; if no exception is raised the test
        # still passes. Confirm whether that leniency is intended.
        try:
            exp.append(OneHotHashVectorizer() << {'edu2': 'education'})
        except RuntimeError as e:
            assert "Model is fitted and cannot be modified" in str(e)
        try:
            exp.insert(0, OneHotHashVectorizer() << {'edu2': 'education'})
        except RuntimeError as e:
            assert "Model is fitted and cannot be modified" in str(e)
        try:
            del exp[0]
        except RuntimeError as e:
            assert "Model is fitted and cannot be modified" in str(e)

        # Positional indexing: exp[1] is indexed again with [1] to reach
        # the transform object (presumably a (name, step) pair).
        obj = exp[1][1]
        assert obj.__class__.__name__ == "OneHotHashVectorizer"
        # NOTE(review): this lookup repeats the one above verbatim.
        obj = exp[1][1]
        assert obj.__class__.__name__ == "OneHotHashVectorizer"
        # Lookup by step name returns a collection with one match.
        res = exp['OneHotHashVectorizer']
        assert len(res) == 1
        # The graph built for the pipeline has at least one node per step.
        graph = exp.graph_
        assert len(graph.nodes) >= len(exp)
示例#29
0
    def test_ensemble_supports_output_predictor_model(self):
        """
        Check that a pipeline containing a VotingRegressor can emit a
        predictor-only model.

        The full pipelines (RangeFilter + ensemble) must agree with each
        other and return two rows. The predictor-only pipeline skips the
        filter, so it must return all four rows while still agreeing on
        the first two scores.
        """
        # Extend the test set with two extra rows. pd.concat is used
        # because DataFrame.append was removed in pandas 2.0.
        extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
        test2_df = pd.concat([test_df.copy(deep=True), extra_rows],
                             ignore_index=True)
        test2_df = test2_df.astype({'c1': np.float32, 'c2': np.float32})

        # Create a ground truth pipeline
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c1',
                                      VotingRegressor(estimators=[r1, r2], combiner='Average')])
        combined_pipeline.fit(train_df)
        result_1 = combined_pipeline.predict(test2_df)

        # Create a duplicate pipeline but also request a predictor model
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c1',
                                      VotingRegressor(estimators=[r1, r2], combiner='Average')])
        combined_pipeline.fit(train_df, output_predictor_model=True)
        result_2 = combined_pipeline.predict(test2_df)

        # Create a predictor model only pipeline
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)
        result_3 = predictor_pipeline.predict(test2_df)

        # Verify the first rows are equal
        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_2.loc[0, 'Score'], result_3.loc[0, 'Score'])

        # Verify the second rows are equal
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
        self.assertEqual(result_2.loc[1, 'Score'], result_3.loc[1, 'Score'])

        # Verify the number of rows
        self.assertEqual(len(result_1), 2)
        self.assertEqual(len(result_2), 2)
        self.assertEqual(len(result_3), 4)
示例#30
0
    def test_ensemble_with_average_and_median_combiner(self):
        """
        Check the 'Average' and 'Median' combiners of VotingRegressor
        against values computed by hand from three individually fitted
        regressors, for the first two rows of test_df.
        """
        # Individual predictions used to compute the expected values.
        ols = OrdinaryLeastSquaresRegressor(**olsrArgs)
        ols.fit(train_df)
        ols_pred = ols.predict(test_df)

        ogd = OnlineGradientDescentRegressor(**ogdArgs)
        ogd.fit(train_df)
        ogd_pred = ogd.predict(test_df)

        lgbm = LightGbmRegressor(**lgbmArgs)
        lgbm.fit(train_df)
        lgbm_pred = lgbm.predict(test_df)

        # 'Average' combiner, built from fresh unfitted estimators.
        fresh_estimators = [
            OrdinaryLeastSquaresRegressor(**olsrArgs),
            OnlineGradientDescentRegressor(**ogdArgs),
            LightGbmRegressor(**lgbmArgs),
        ]
        averaging = Pipeline([VotingRegressor(estimators=fresh_estimators,
                                              combiner='Average')])
        averaging.fit(train_df)
        averaged = averaging.predict(test_df)

        expected_means = [
            (ols_pred[row] + ogd_pred[row] + lgbm_pred[row]) / 3
            for row in (0, 1)
        ]
        for row, mean in enumerate(expected_means):
            self.assertAlmostEqual(mean, averaged.loc[row, 'Score'], places=5)

        # 'Median' combiner, again from fresh unfitted estimators.
        fresh_estimators = [
            OrdinaryLeastSquaresRegressor(**olsrArgs),
            OnlineGradientDescentRegressor(**ogdArgs),
            LightGbmRegressor(**lgbmArgs),
        ]
        median_voting = Pipeline([VotingRegressor(estimators=fresh_estimators,
                                                  combiner='Median')])
        median_voting.fit(train_df)
        medians = median_voting.predict(test_df)

        # With three values, the median is the middle one after sorting.
        expected_medians = [
            sorted([ols_pred.loc[row], ogd_pred.loc[row], lgbm_pred.loc[row]])[1]
            for row in (0, 1)
        ]
        for row, median in enumerate(expected_medians):
            self.assertEqual(median, medians.loc[row, 'Score'])