示例#1
0
    def test_syntax11_learner(self):
        """Fit with feature and label roles supplied as a dict on the learner.

        The pipeline is fitted on the full frame (label column included)
        and then asked to predict on the frame without the label.
        """
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = data.drop('y', axis=1)

        onehot = OneHotVectorizer() << {'edu1': 'education'}
        hashed = OneHotHashVectorizer() << {'edu2': 'education'}
        learner = FastLinearBinaryClassifier(max_iterations=1) << {
            'Features': ['edu1', 'edu2'],
            Role.Label: 'y',
        }
        pipeline = Pipeline([onehot, hashed, learner])
        pipeline.fit(data)

        scored = pipeline.predict(features)
        assert isinstance(scored, pandas.DataFrame)
        assert sorted(scored.columns) == [
            'PredictedLabel', 'Probability', 'Score']
        assert scored.shape == (5, 3)
示例#2
0
    def test_syntax5_regular_expression(self):
        """Fit a pipeline whose Concat step is given the pattern 'f[0-9]+'.

        Checks that prediction returns a 5x3 frame with the binary
        classifier output columns.
        """
        # REVIEW: not implemented yet
        # Ideally regular expressions would be handled inside nimbusml.
        # That could happen in entrypoint.py just before calling nimbusml,
        # or inside Pipeline if it is aware of the input schema.
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = data.drop('y', axis=1)
        labels = data['y']

        steps = [
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': 'f[0-9]+'},
            FastLinearBinaryClassifier(max_iterations=1) << 'Features',
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(features, labels)

        scored = pipeline.predict(features)
        assert isinstance(scored, pandas.DataFrame)
        assert sorted(scored.columns) == [
            'PredictedLabel', 'Probability', 'Score']
        assert scored.shape == (5, 3)
示例#3
0
    def test_syntax6_regular_expression(self):
        """Concatenate f1..f3 into 'Features', apply Drop to '~Features', fit.

        The learner is given no explicit column; the test only checks the
        shape and column names of the prediction frame.
        """
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = data.drop('y', axis=1)
        labels = data['y']

        generated = ['f1', 'f2', 'f3']
        steps = [
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': generated},
            Drop() << '~Features',
            FastLinearBinaryClassifier(max_iterations=1),
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(features, labels)

        scored = pipeline.predict(features)
        assert isinstance(scored, pandas.DataFrame)
        assert sorted(scored.columns) == [
            'PredictedLabel', 'Probability', 'Score']
        assert scored.shape == (5, 3)
示例#4
0
    def test_syntax4_fail(self):
        """Fitting a learner given a list of vector columns must fail.

        The learner receives ['edu1', 'edu2', 'wki'] directly. ``fit`` is
        expected to raise a RuntimeError whose message contains the
        suggested ConcatTransform expression.
        """
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {
                'edu1': 'education'
            },
            OneHotHashVectorizer() << {
                'edu2': 'education'
            },
            OneHotVectorizer(max_num_terms=2) << {
                'wki': 'workclass'
            },
            FastLinearBinaryClassifier(max_iterations=1) <<
            ['edu1', 'edu2', 'wki']
        ])
        try:
            exp.fit(X, y)
        except RuntimeError as e:
            assert "ConcatTransform() << {'Input': ['edu1', 'edu2', 'wki']}" \
                   in str(e)
        else:
            # An explicit raise (rather than ``assert False``) keeps the
            # check alive even when Python runs with -O.
            raise AssertionError("fit should have raised a RuntimeError.")
示例#5
0
    def test_syntax4_fail2(self):
        """Fitting with an unknown feature column ('edu4') must fail.

        ``fit`` is expected to raise an exception whose message contains
        "Feature column 'edu4' not found".
        """
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {
                'edu1': 'education'
            },
            OneHotHashVectorizer() << {
                'edu2': 'education'
            },
            OneHotVectorizer(max_num_terms=2) << {
                'wki': 'workclass'
            },
            FastLinearBinaryClassifier(max_iterations=1) <<
            ['edu1', 'edu4', 'wki']
        ])
        try:
            exp.fit(X, y)
        except Exception as e:
            assert "Feature column 'edu4' not found" in str(e)
        else:
            # Raised outside the ``try`` so the broad ``except Exception``
            # cannot swallow it and replace it with a misleading failure.
            raise AssertionError("The test should not reach this line.")
示例#6
0
    def test_syntax4_dict(self):
        """Concatenate three generated columns into 'Inputs' and train on it.

        Checks that prediction returns a 5x3 frame with the binary
        classifier output columns.
        """
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = data.drop('y', axis=1)
        labels = data['y']

        steps = [
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << {'edu2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
            Concat() << {'Inputs': ['edu1', 'edu2', 'wki']},
            FastLinearBinaryClassifier(max_iterations=1) << 'Inputs',
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(features, labels)

        scored = pipeline.predict(features)
        assert isinstance(scored, pandas.DataFrame)
        assert sorted(scored.columns) == [
            'PredictedLabel', 'Probability', 'Score']
        assert scored.shape == (5, 3)
示例#7
0
    def test_syntax6_change_role(self):
        """Drop every column except 'Features' and train on ['Features']."""
        # REVIEW: the pipeline drops all columns but one -->
        # nimbusml still thinks the Features are education, workclass
        # and does not automatically detect that the only remaining
        # column should play that role
        # (maybe because the label column is here too even though
        # the only remaining column without a role is Features).
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = data.drop('y', axis=1)
        labels = data['y']

        generated = ['f1', 'f2', 'f3']
        learner = FastLinearBinaryClassifier(maximum_number_of_iterations=1)
        pipeline = Pipeline([
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': generated},
            Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
            learner << ['Features'],
        ])
        pipeline.fit(features, labels, verbose=0)

        scored = pipeline.predict(features)
        assert isinstance(scored, pandas.DataFrame)
        assert sorted(scored.columns) == [
            'PredictedLabel', 'Probability', 'Score']
        assert scored.shape == (5, 3)
示例#8
0
    def test_ensemble_supports_user_defined_transforms(self):
        """VotingRegressor must work behind a user-defined RangeFilter.

        Two extra rows (c1 = 9 and 11) are added to the test data. Three
        regressors are trained and scored on their own, then fresh
        instances are combined in a VotingRegressor placed after a
        RangeFilter(min=0, max=10) on 'c1'. The ensemble is expected to
        return three rows whose scores are the average of the individual
        predictions.
        """
        # pd.concat replaces DataFrame.append, which was removed in
        # pandas 2.0; the original index labels are kept.
        extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
        test2_df = pd.concat([test_df.copy(deep=True), extra_rows])

        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r1.fit(train_df)
        result1 = r1.predict(test2_df)

        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r2.fit(train_df)
        result2 = r2.predict(test2_df)

        r3 = LightGbmRegressor(**lgbmArgs)
        r3.fit(train_df)
        result3 = r3.predict(test2_df)

        # Fresh instances of the same learners for the ensemble.
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r3 = LightGbmRegressor(**lgbmArgs)

        pipeline = Pipeline([
            RangeFilter(min=0, max=10, columns='c1'),
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ])
        pipeline.fit(train_df)
        result4 = pipeline.predict(test2_df)

        self.assertEqual(len(result4), 3)

        for row in range(3):
            average = (result1[row] + result2[row] + result3[row]) / 3
            self.assertAlmostEqual(average, result4.loc[row, 'Score'],
                                   places=5)
示例#9
0
    def test_syntax3(self):
        """Mix a dict column mapping with plain column names via '<<'.

        Checks that prediction returns a 5x3 frame with the binary
        classifier output columns.
        """
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = data.drop('y', axis=1)
        labels = data['y']

        renamed_onehot = OneHotVectorizer() << {'edu1': 'education'}
        hashed = OneHotHashVectorizer() << 'education'
        workclass_onehot = OneHotVectorizer(max_num_terms=2) << 'workclass'
        # Currently the learner does not use edu1 unless it is specified
        # explicitly, so nimbusml does not do what the syntax implicitly
        # suggests. The bridge would need to look into every available
        # column at each step.
        learner = FastLinearBinaryClassifier(max_iterations=1)

        pipeline = Pipeline(
            [renamed_onehot, hashed, workclass_onehot, learner])
        pipeline.fit(features, labels)

        scored = pipeline.predict(features)
        assert isinstance(scored, pandas.DataFrame)
        assert sorted(scored.columns) == [
            'PredictedLabel', 'Probability', 'Score']
        assert scored.shape == (5, 3)
示例#10
0
    def test_syntax10_weights(self):
        """Pass example weights to ``fit`` and check the learner's roles.

        After fitting, the last node must report 'Features', 'y' and
        'weight' as feature, label and weight columns. Predictions must be
        a single 'Score' column within [1, 3.6] with at least four
        distinct values.
        """
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [1., 1., 1., 2., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })
        X = data.drop(['y', 'weight'], axis=1)
        labels = data['y']
        weights = data['weight']

        encoder = OneHotVectorizer() << ['workclass', 'education']
        pipeline = Pipeline([encoder, FastLinearRegressor()])
        pipeline.fit(X, labels, weight=weights, verbose=0)

        learner = pipeline.nodes[-1]
        assert learner.feature_column == 'Features'
        assert learner.label_column == 'y'
        assert learner.weight_column == 'weight'

        # A placeholder weight column is supplied at prediction time.
        X['weight'] = -5
        prediction = pipeline.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)
        if (prediction['Score'].min() < 1.
                or prediction['Score'].max() > 3.6
                or len(set(prediction['Score'])) < 4):
            raise Exception(prediction)
示例#11
0
    def test_test(self):
        """``Pipeline.test`` must give equal results for both data forms.

        The two objects returned by ``transform_data()`` are each passed
        to ``test``. Metrics and scores are compared after fitting on the
        first object, and again after refitting on the second one split
        into features and the 'age' label.
        """
        transformed_data, transformed_data_df = transform_data()
        learner = FastLinearRegressor(
            feature=['parity', 'in', 'sp', 'stratum'],
            label='age')
        flpipe = Pipeline([learner])

        def compare_outputs():
            # Evaluate both inputs and require identical scores and metrics.
            metrics, scores = flpipe.test(
                transformed_data, output_scores=True)
            metrics_df, scores_df = flpipe.test(
                transformed_data_df, output_scores=True)
            assert_array_equal(scores, scores_df)
            assert_array_equal(metrics, metrics_df)

        flpipe.fit(transformed_data)
        compare_outputs()

        flpipe.fit(transformed_data_df.drop('age', axis=1),
                   transformed_data_df['age'])
        compare_outputs()
示例#12
0
    def test_metrics_evaluate_clusterer(self):
        """Check clustering metrics of KMeansPlusPlus on a binarised iris set.

        'Species' is dropped and 'Label' is mapped to 1/0. The data is
        split with ``train_test_split``, a two-cluster model is trained,
        and the 'NMI' and 'AvgMinScore' metrics from ``Pipeline.test`` are
        compared against reference values with loose tolerances.
        """
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = KMeansPlusPlus(n_clusters=2, initialization_algorithm="Random")
        e = Pipeline([lr])
        e.fit(X_train, y_train.to_frame(), verbose=0)
        metrics, _ = e.test(X_test, y_test)

        # Each reference value is named once so that the failure message
        # always reports the value that is actually being asserted.
        expected_nmi = 0.7
        expected_avg_min_score = 0.032
        assert_almost_equal(metrics['NMI'][0],
                            expected_nmi,
                            decimal=0,
                            err_msg="NMI loss should be %s" % expected_nmi)
        assert_almost_equal(metrics['AvgMinScore'][0],
                            expected_avg_min_score,
                            decimal=2,
                            err_msg="AvgMinScore  should be %s" %
                            expected_avg_min_score)
示例#13
0
    def test_pipeline_loaded_from_zip_has_feature_contributions(self):
        """A pipeline reloaded from a zip must give the same contributions.

        Feature contributions are computed with the trained pipeline, the
        model is saved to a temporary zip file and loaded into a new
        Pipeline, and every 'FeatureContributions.<feature>' column of the
        reloaded model is compared with the original one.
        """
        features = ['age', 'education-num', 'hours-per-week']

        model_nimbusml = Pipeline(
            steps=[FastLinearBinaryClassifier(feature=features)])
        model_nimbusml.fit(train, label)
        fc = model_nimbusml.get_feature_contributions(test)

        model_filename = get_temp_file(suffix='.zip')
        try:
            # Save the model to zip
            model_nimbusml.save_model(model_filename)
            # Load the model from zip
            model_nimbusml_zip = Pipeline()
            model_nimbusml_zip.load_model(model_filename)

            fc_zip = model_nimbusml_zip.get_feature_contributions(test)

            # Each column is asserted on its own: asserting on a list
            # comprehension is always true for a non-empty list and
            # would never fail.
            for feature in features:
                column = 'FeatureContributions.' + feature
                assert column in fc_zip.columns
                assert fc[column].equals(fc_zip[column])
        finally:
            # Remove the temporary model even when an assertion fails.
            if os.path.exists(model_filename):
                os.remove(model_filename)
示例#14
0
    def test_pipeline_with_no_columns_raise(self):
        """Fitting LightGbmClassifier with defaults on six rows must raise.

        ``get_fit_info`` is checked first (two entries, the first of
        length three), then ``fit`` is expected to raise RuntimeError.
        """
        sentiments = [0, 1, 1, 0, 1, 1]
        texts = [
            "this is train ", "review ", "sentence ", "an apple",
            "sentence 22", "another one one one"
        ]
        trainData = pd.DataFrame({
            "Sentiment": sentiments,
            "SentimentText": texts
        })

        featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
        ppl = Pipeline([featurizer, LightGbmClassifier()])
        assert ppl is not None

        # Bug 147697
        info = ppl.get_fit_info(trainData[["SentimentText"]],
                                trainData["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3

        # Message recorded for the expected failure:
        # System.InvalidOperationException:
        # 'LightGBM Error, code is -1, error message is
        # 'Cannot construct Dataset since there are not useful features.
        # It should be at least two unique rows.
        # If the num_row (num_data) is small,
        # you can set min_data=1 and min_data_in_bin=1 to fix this.
        # Otherwise please make sure you are using the right dataset.'
        with self.assertRaises(RuntimeError):
            ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])
示例#15
0
 def test_model_summary(self):
     """Fit each entry of ``learners`` behind a OneHotVectorizer and call summary()."""
     for candidate in learners:
         steps = [OneHotVectorizer() << categorical_columns, candidate]
         model = Pipeline(steps)
         stream = FileDataStream(train_file, schema=file_schema)
         model.fit(stream, label_column)
         model.summary()
示例#16
0
    def test_two_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(
            self):
        """
        Verify that a transform pipeline and a predictor pipeline can
        be combined even if the transform increases the number of columns.
        """
        # Step 1: fit a OneHotVectorizer on 'c0' and transform the
        # training data into a binary data stream.
        encoder_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                    random_state=seed)
        encoder_pipeline.fit(train_df)
        encoded_train = encoder_pipeline.transform(
            train_df, as_binary_data_stream=True)

        # Step 2: fit the regressor on the transformed training data.
        regressor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        regressor_pipeline = Pipeline([regressor], random_state=seed)
        regressor_pipeline.fit(encoded_train)

        # Step 3: score the test data by chaining the two pipelines by hand.
        encoded_test = encoder_pipeline.transform(
            test_df, as_binary_data_stream=True)
        chained_scores = regressor_pipeline.predict(encoded_test)

        # Step 4: score the test data with the combined pipeline.
        combined = Pipeline.combine_models(encoder_pipeline,
                                           regressor_pipeline)
        combined_scores = combined.predict(test_df)

        # Both routes must produce the same scores for the first two rows.
        for row in (0, 1):
            self.assertEqual(chained_scores.loc[row, 'Score'],
                             combined_scores.loc[row, 'Score'])
示例#17
0
    def test_experiment_loadsavemodel(self):
        """A saved and reloaded pipeline must produce the same metrics.

        The pipeline is trained and evaluated, saved to a temporary file,
        loaded into a new Pipeline and evaluated again. The sums of all
        metric values from both evaluations must match.
        """
        (train, label) = get_X_y(train_file, label_column, sep=',')
        (test, label1) = get_X_y(test_file, label_column, sep=',')
        cat = OneHotVectorizer() << categorical_columns
        ftree = FastTreesBinaryClassifier()
        pipeline = Pipeline([cat, ftree])
        pipeline.fit(train, label)
        metrics1, scores1 = pipeline.test(test,
                                          label1,
                                          'binary',
                                          output_scores=True)
        sum1 = metrics1.sum().sum()

        # mkstemp returns an open descriptor; close it so save_model can
        # write to the path.
        (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
        os.close(fd)
        try:
            pipeline.save_model(modelfilename)

            pipeline2 = Pipeline()
            pipeline2.load_model(modelfilename)
            # The file is kept until evaluation is done in case the
            # loaded pipeline still reads from it.
            metrics2, scores2 = pipeline2.test(test,
                                               label1,
                                               'binary',
                                               output_scores=True)
            sum2 = metrics2.sum().sum()

            assert_equal(sum1, sum2,
                         "model metrics don't match after loading model")
        finally:
            # Do not leave the temporary model file behind.
            if os.path.exists(modelfilename):
                os.remove(modelfilename)
    def test_notvectorized_output_predictor_model(self):
        """
        Verify that the predictor model output by a combined
        (featurizer + learner) pipeline runs successfully on
        featurized data with no vectors.
        """
        source = train_df.drop(['c0'], axis=1)

        # Fit a standalone RangeFilter on 'c2' and use it to produce the
        # featurized version of the data.
        filter_only = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                               random_state=seed)
        filter_only.fit(source)
        featurized = filter_only.transform(source)

        # Fit the combined pipeline and ask it to output the predictor model.
        combined_steps = [
            RangeFilter(min=0.0, max=4.5) << 'c2',
            OnlineGradientDescentRegressor(label='c2')
        ]
        combined_pipeline = Pipeline(combined_steps, random_state=seed)
        combined_pipeline.fit(source, output_predictor_model=True)
        combined_scores = combined_pipeline.predict(source)

        # Load the predictor model on its own and score the featurized data.
        predictor_only = Pipeline()
        predictor_only.load_model(combined_pipeline.predictor_model)
        predictor_scores = predictor_only.predict(featurized)

        for row in (0, 1):
            self.assertEqual(combined_scores.loc[row, 'Score'],
                             predictor_scores.loc[row, 'Score'])
示例#19
0
 def test_model_summary_not_supported(self):
     """summary() must raise TypeError for each of ``learners_not_supported``."""
     for candidate in learners_not_supported:
         steps = [OneHotVectorizer() << categorical_columns, candidate]
         model = Pipeline(steps)
         stream = FileDataStream(train_file, schema=file_schema)
         model.fit(stream, label_column)
         assert_raises(TypeError, model.summary)
示例#20
0
    def test_unpickled_pipeline_has_feature_contributions(self):
        """A pickled and unpickled pipeline must give the same contributions.

        Feature contributions are computed with the trained pipeline, the
        pipeline is pickled to a temporary file and loaded back, and every
        'FeatureContributions.<feature>' column of the unpickled model is
        compared with the original one.
        """
        features = ['age', 'education-num', 'hours-per-week']

        model_nimbusml = Pipeline(
            steps=[FastLinearBinaryClassifier(feature=features)])
        model_nimbusml.fit(train, label)
        fc = model_nimbusml.get_feature_contributions(test)

        pickle_filename = get_temp_file(suffix='.p')
        try:
            # Save with pickle
            with open(pickle_filename, 'wb') as f:
                pickle.dump(model_nimbusml, f)
            # Unpickle model (the file was written by this test just above)
            with open(pickle_filename, "rb") as f:
                model_nimbusml_pickle = pickle.load(f)

            fc_pickle = model_nimbusml_pickle.get_feature_contributions(test)

            # Each column is asserted on its own: asserting on a list
            # comprehension is always true for a non-empty list and
            # would never fail.
            for feature in features:
                column = 'FeatureContributions.' + feature
                assert column in fc_pickle.columns
                assert fc[column].equals(fc_pickle[column])
        finally:
            # Remove the temporary pickle even when an assertion fails.
            if os.path.exists(pickle_filename):
                os.remove(pickle_filename)
示例#21
0
    def test_metrics_evaluate_regressor(self):
        """Check regression metrics of FastTreesRegressor on a binarised iris set.

        The 'L1(avg)', 'L2(avg)' and 'Loss-fn(avg)' metrics from
        ``Pipeline.test`` are compared against reference values to one
        decimal place.
        """
        np.random.seed(0)
        iris = get_dataset("iris").as_df()
        iris.drop(['Species'], inplace=True, axis=1)
        iris.Label = [1 if value == 1 else 0 for value in iris.Label]
        feature_frame = iris.loc[:, iris.columns != 'Label']
        X_train, X_test, y_train, y_test = train_test_split(
            feature_frame, iris['Label'])

        pipeline = Pipeline([FastTreesRegressor()])
        pipeline.fit(X_train, y_train.to_frame(), verbose=0)
        metrics, _ = pipeline.test(X_test, y_test)

        # TODO: debug fluctuations, and increase decimal precision on checks
        # (metric name, reference value, failure message template)
        checks = [
            ('L1(avg)', 0.107, "L1 loss should be %s"),
            ('L2(avg)', 0.0453, "L2(avg) should be %s"),
            ('Loss-fn(avg)', 0.0453, "Loss-fn(avg)loss should be %s"),
        ]
        for metric_name, reference, template in checks:
            assert_almost_equal(metrics[metric_name][0],
                                reference,
                                decimal=1,
                                err_msg=template % reference)
示例#22
0
    def test_syntax8_label(self):
        """Train on a label column produced inside the pipeline ('new_y').

        After fitting, the last node must report 'Features' and 'new_y'
        as feature and label columns. Predictions must be a single
        'Score' column with values within [0.4, 2.0].
        """
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
        })
        X = data.drop('yy', axis=1)

        categorical = ['workclass', 'education']
        steps = [
            MeanVarianceScaler() << {'new_y': 'yy'},
            OneHotVectorizer() << ['workclass', 'education'],
            Drop() << 'yy',
            FastLinearRegressor() << {'Feature': categorical,
                                      Role.Label: 'new_y'},
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(data, verbose=0)

        learner = pipeline.nodes[-1]
        assert learner.feature_column_ == 'Features'
        assert learner.label_column_ == 'new_y'

        # The pipeline requires it now as it is transformed all along.
        X['yy'] = 0.0
        prediction = pipeline.predict(X, verbose=0)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)
        if (prediction['Score'].min() < 0.4
                or prediction['Score'].max() > 2.00):
            raise Exception(prediction)
示例#23
0
    def test_syntax7_rename(self):
        """Convert a string label inside the pipeline and train on it.

        The label 'y' is one-hot encoded, converted to R4 as 'yi', and
        the original 'y' is dropped before the learner. Predictions must
        be a single 'Score' column with all values strictly between 0.01
        and 0.05.
        """
        # Error message are usually not informative enough.
        # Missing column --> no indication of other columns.
        # Error is (one transform should handle it)
        # 'The label column 'y' of the training data has a data type
        # not suitable for binary classification: Vec<Key<U4, 0-1>, 2>.
        # Type must be Bool, R4, R8 or Key with two classes.

        df = pandas.DataFrame(
            dict(
                education=[
                    'A', 'B', 'A', 'B', 'A'], workclass=[
                    'X', 'X', 'Y', 'Y', 'Y'], y=[
                    'red', 'white', 'red', 'white', 'white']))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << 'y',
            OneHotVectorizer() << ['workclass', 'education'],
            TypeConverter(result_type='R4') << {'yi': 'y'},
            Drop() << 'y',
            FastLinearBinaryClassifier(max_iterations=1) << 'yi'
        ])
        exp.fit(X, y, verbose=0)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)
        # Compare scalars: DataFrame.min()/max() return a Series, whose
        # truth value is ambiguous and raises ValueError inside ``assert``.
        scores = prediction['Score']
        assert scores.min() > 0.01
        assert scores.max() < 0.05
示例#24
0
    def test_syntax12_mixed2(self):
        """Mix constructor arguments with the '<<' syntax for column roles.

        The feature and weight columns are passed as constructor
        arguments and the label through ``<<``. After fitting, the last
        node must report 'Feature', 'y' and 'weight' as its columns.
        """
        X = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [10., 1., 1., 1., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        encoder = OneHotVectorizer(columns=['workclass', 'education'])
        concat = Concat(columns={'Feature': ['workclass', 'education']})
        regressor = FastTreesRegressor(
            num_trees=5, feature='Feature', weight='weight')
        pipeline = Pipeline([encoder, concat, regressor << {Role.Label: 'y'}])
        pipeline.fit(X, verbose=0)

        learner = pipeline.nodes[-1]
        assert learner.feature_column_ == 'Feature'
        assert learner.label_column_ == 'y'
        assert learner.weight_column_ == 'weight'

        # y is required here as well as weight.
        # It is replaced by fake values.
        # The test does not fail but the weight is not taken into account.
        X['y'] = -5
        X['weight'] = -5
        prediction = pipeline.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)
示例#25
0
    def test_pipeline_with_no_columns(self):
        """Fit NGramFeaturizer + LightGbmClassifier without naming columns.

        One pipeline is fitted with the label given as a pandas Series
        (after checking ``get_fit_info``), and a second identical pipeline
        with the label given as a numpy array.
        """
        sentiments = [0, 1, 1, 0, 1, 1]
        texts = [
            "this is train ", "review ", "sentence ", "an apple",
            "sentence 22", "another one one one"
        ]
        trainData = pd.DataFrame({
            "Sentiment": sentiments,
            "SentimentText": texts
        })

        def build_pipeline():
            # Fresh featurizer and learner for each fit.
            return Pipeline([
                NGramFeaturizer(word_feature_extractor=n_gram()),
                LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
            ])

        ppl = build_pipeline()
        assert ppl is not None

        # Bug 147697
        info = ppl.get_fit_info(trainData[["SentimentText"]],
                                trainData["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3
        ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])

        ppl = build_pipeline()
        assert ppl is not None
        ppl.fit(trainData[["SentimentText"]], np.array(trainData["Sentiment"]))
示例#26
0
    def test_metrics_evaluate_binary(self):
        """Check binary metrics of LogisticRegressionBinaryClassifier on iris.

        'Species' is dropped and 'Label' is mapped to 1/0. Each metric
        returned by ``Pipeline.test`` is compared against a reference
        value with the number of decimals listed next to it.
        """
        np.random.seed(0)
        iris = get_dataset("iris").as_df()
        iris.drop(['Species'], inplace=True, axis=1)
        iris.Label = [1 if value == 1 else 0 for value in iris.Label]
        feature_frame = iris.loc[:, iris.columns != 'Label']
        X_train, X_test, y_train, y_test = train_test_split(
            feature_frame, iris['Label'])

        pipeline = Pipeline([LogisticRegressionBinaryClassifier()])
        pipeline.fit(X_train, y_train, verbose=0)
        metrics, _ = pipeline.test(X_test, y_test)

        # TODO: debug fluctuations, and increase decimal precision on checks
        # (metric name, reference value, decimals compared)
        expectations = [
            ('AUC', 0.980, 1),
            ('Accuracy', 0.632, 1),
            ('Positive precision', 1, 1),
            ('Positive recall', 0.125, 1),
            ('Negative precision', 0.611, 1),
            ('Negative recall', 1, 1),
            ('Log-loss', 0.686, 1),
            ('Log-loss reduction', 0.3005, 3),
            ('Test-set entropy (prior Log-Loss/instance)', 0.981, 1),
            ('F1 Score', 0.222, 1),
            ('AUPRC', 0.966, 1),
        ]
        for metric_name, reference, decimals in expectations:
            assert_almost_equal(
                metrics[metric_name][0],
                reference,
                decimal=decimals,
                err_msg="%s should be %s" % (metric_name, reference))
    def _test_schema_syntax_shift_df(self):
        """Fit on a frame, naming the label column ('yl') by string.

        The leading underscore keeps the method out of default test
        discovery.
        """
        columns = {
            'X1': [0.1, 0.2],
            'X2': [0.1, 0.2],
            'yl': [1, 0],
            'tx': ['e', 'r'],
        }
        frame = pandas.DataFrame(data=columns)

        encoder = OneHotVectorizer() << 'tx'
        pipeline = Pipeline([encoder, FastLinearBinaryClassifier()])
        pipeline.fit(frame, 'yl')
示例#28
0
 def test_trees(self):
     """Train FastTreesBinaryClassifier on in-memory data and check accuracy against 0.65."""
     train_X, train_y = get_X_y(train_file, label_column, sep=',')
     test_X, _ = get_X_y(test_file, label_column, sep=',')
     encoder = OneHotVectorizer() << categorical_columns
     model = Pipeline([encoder, FastTreesBinaryClassifier()])
     model.fit(train_X, train_y)
     predictions = model.predict(test_X)
     check_accuracy(test_file, label_column, predictions, 0.65)
    def test_default_label(self):
        """Omitting the Label and/or Feature role must not change the result.

        The same pipeline is fitted four times: with both roles given,
        with only Feature, with no roles, and with only Label. The
        probabilities from the last three runs must match the first.
        """
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if value == 1 else 0 for value in df.Label]

        def fit_and_score(roles=None):
            # Build a fresh pipeline, attaching ``roles`` to the learner
            # only when they are given, and return its probabilities.
            concat = ColumnConcatenator() << {
                'Features': ["Petal_Length", "Sepal_Length"]
            }
            learner = FastTreesBinaryClassifier(number_of_trees=2)
            if roles is not None:
                learner = learner << roles
            model = Pipeline([concat, learner]).fit(df, verbose=0)
            return model.predict_proba(df)

        # 1: both roles explicit -- this is the reference run.
        probabilities0 = fit_and_score({
            Role.Label: 'Label',
            Role.Feature: 'Features'
        })

        # 2: Feature only, 3: no roles at all, 4: Label only.
        partial_roles = (
            {Role.Feature: 'Features'},
            None,
            {Role.Label: 'Label'},
        )
        for roles in partial_roles:
            probabilities = fit_and_score(roles)
            assert_array_almost_equal(probabilities0, probabilities)
示例#30
0
 def test_trees_file(self):
     """Train FastTreesBinaryClassifier from file streams and check accuracy against 0.65."""
     encoder = OneHotVectorizer() << categorical_columns
     learner = FastTreesBinaryClassifier() << {'Label': label_column}
     model = Pipeline([encoder, learner])
     training = FileDataStream(train_file, schema=file_schema)
     model.fit(training, label_column)
     testing = FileDataStream(test_file, schema=file_schema)
     predictions = model.predict(testing)
     check_accuracy(test_file, label_column, predictions, 0.65)