예제 #1
0
    def test_learners_sweep(self):
        """Grid search can replace the whole 'learner' step of a pipeline.

        The pipeline is declared with FastTreesBinaryClassifier, but the grid
        also offers FastLinearBinaryClassifier for that step. The assertions
        expect the linear learner with one thread to be the best candidate,
        which shows the alternative learner was really searched.
        """
        np.random.seed(0)

        frame = pd.DataFrame({
            'education': ['A', 'A', 'A', 'A', 'B', 'A', 'B'],
            'workclass': ['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
            'y': [1, 0, 1, 1, 0, 1, 0],
        })
        features = frame.drop('y', axis=1)
        target = frame['y']

        steps = [
            ('cat', OneHotVectorizer() << ['education', 'workclass']),
            ('learner', FastTreesBinaryClassifier()),
        ]
        search_space = {
            'learner': [
                FastLinearBinaryClassifier(),
                FastTreesBinaryClassifier(),
            ],
            'learner__number_of_threads': [1, 4],
        }
        search = GridSearchCV(Pipeline(steps=steps), search_space)
        search.fit(features, target)

        winner = search.best_params_
        assert winner['learner'].__class__.__name__ == \
            'FastLinearBinaryClassifier'
        assert winner['learner__number_of_threads'] == 1
    def test_default_label(self):
        """Leaving out the Label and/or Feature role changes nothing.

        A reference model is trained with both roles named explicitly. The
        same pipeline is then rebuilt with only the Feature role, with no
        role at all, and with only the Label role; each variant must produce
        (almost) the same probabilities as the reference.
        """
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if value == 1 else 0 for value in df.Label]

        def fit_and_score(roles):
            # roles=None means the learner is used without any `<<` mapping
            concat = ColumnConcatenator() << {
                'Features': ["Petal_Length", "Sepal_Length"]
            }
            learner = FastTreesBinaryClassifier(number_of_trees=2)
            if roles is not None:
                learner = learner << roles
            fitted = Pipeline([concat, learner]).fit(df, verbose=0)
            return fitted.predict_proba(df)

        # 1: both roles given explicitly -> reference probabilities
        probabilities0 = fit_and_score({
            Role.Label: 'Label',
            Role.Feature: 'Features'
        })

        # 2: feature role only, 3: no roles, 4: label role only
        variants = (
            {Role.Feature: 'Features'},
            None,
            {Role.Label: 'Label'},
        )
        for roles in variants:
            probabilities = fit_and_score(roles)
            assert_array_almost_equal(probabilities0, probabilities)
예제 #3
0
    def test_hyperparameters_sweep(self):
        """Grid search over a pipeline mixing named and unnamed steps.

        Only the named steps ('cat' and 'learner') are swept; the unnamed
        hash vectorizer is not part of the grid.
        """
        np.random.seed(0)
        frame = pd.DataFrame({
            'education': ['A', 'A', 'A', 'A', 'B', 'A', 'B'],
            'workclass': ['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
            'y': [1, 0, 1, 1, 0, 1, 0],
        })
        features = frame.drop('y', axis=1)
        target = frame['y']

        one_hot = OneHotVectorizer() << 'education'
        # unnamed step, stays same in grid search
        hashed = OneHotHashVectorizer() << 'workclass'
        # number_of_trees 0 will actually be never run by grid search
        trees = FastTreesBinaryClassifier(number_of_trees=0,
                                          number_of_leaves=2)
        pipe = Pipeline([('cat', one_hot), hashed, ('learner', trees)])

        search_space = {
            'cat__output_kind': ['Indicator', 'Binary'],
            'learner__number_of_trees': [1, 2, 3],
        }
        search = GridSearchCV(pipe, search_space)
        search.fit(features, target)

        print(search.best_params_)
        expected = {
            'cat__output_kind': 'Indicator',
            'learner__number_of_trees': 1
        }
        assert search.best_params_ == expected
예제 #4
0
    def test_experiment_loadsavemodel(self):
        """A saved and reloaded pipeline yields the same evaluation metrics.

        The pipeline is trained and evaluated, saved to a temporary file,
        loaded into a fresh Pipeline and evaluated again; the summed metrics
        of both evaluations must be equal. The temporary model file is
        removed afterwards so repeated runs do not litter the temp directory.
        """
        (train, label) = get_X_y(train_file, label_column, sep=',')
        (test, label1) = get_X_y(test_file, label_column, sep=',')
        cat = OneHotVectorizer() << categorical_columns
        ftree = FastTreesBinaryClassifier()
        pipeline = Pipeline([cat, ftree])
        pipeline.fit(train, label)
        metrics1, _ = pipeline.test(test,
                                    label1,
                                    'binary',
                                    output_scores=True)
        sum1 = metrics1.sum().sum()

        (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
        # Only the reserved path is needed; release the descriptor right away.
        os.close(fd)
        try:
            pipeline.save_model(modelfilename)

            pipeline2 = Pipeline()
            pipeline2.load_model(modelfilename)
            # Evaluate while the file still exists: the loaded pipeline may
            # keep referring to it.
            metrics2, _ = pipeline2.test(test,
                                         label1,
                                         'binary',
                                         output_scores=True)
            sum2 = metrics2.sum().sum()
        finally:
            os.remove(modelfilename)

        assert_equal(sum1, sum2,
                     "model metrics don't match after loading model")
예제 #5
0
    def test_ovr_accuracy(self):
        """Each binary learner wrapped in OneVsRestClassifier must reach a
        micro-averaged accuracy above 0.65.

        Learners differ a lot in accuracy, so the bar is deliberately low;
        the test also exercises Pipeline + OneVsRest + learner together.
        """
        base_learners = [
            # TODO: BUG 231482 , why doesnt FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(number_of_threads=1),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1,
                                       number_of_threads=1),
            GamBinaryClassifier(number_of_threads=1),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1,
                                      number_of_threads=1),
            FastLinearBinaryClassifier(number_of_threads=1),
            SgdBinaryClassifier(number_of_threads=1),
            # SymSgdBinaryClassifier(number_of_threads=1),
        ]

        for base in base_learners:
            ovr = OneVsRestClassifier(classifier=base, use_probabilities=True)
            accu = accuracy(ovr)['Accuracy(micro-avg)'][0]
            failure = "{} accuracy is too low {}".format(base.__class__, accu)
            assert_greater(accu, 0.65, failure)
예제 #6
0
    def test_pickle_pipeline(self):
        """A pickled pipeline scores exactly like the original one.

        After pickling, the files referenced by the steps' ``model_``
        attributes are deleted, so the unpickled copy has to work without
        them.
        """
        np.random.seed(0)
        load = dict(sep=',', features=selected_features)
        (X_train, y_train) = get_X_y(train_file, label_column, **load)
        (X_test, y_test) = get_X_y(test_file, label_column, **load)
        if 'F1' in X_train.columns:
            raise Exception("F1 is in the dataset")

        cat = OneHotVectorizer() << 'age'
        ftree = FastTreesBinaryClassifier()
        original = Pipeline(steps=[("cat", cat), ("ftree", ftree)])
        original.fit(X_train, y_train)

        truth = y_test.values.ravel()
        accu1 = np.mean(truth == original.predict(X_test).values)

        # Serialize, drop the on-disk models, then restore and score again.
        blob = pickle.dumps(original)
        os.remove(cat.model_)
        os.remove(ftree.model_)
        restored = pickle.loads(blob)

        accu2 = np.mean(y_test.values.ravel() ==
                        restored.predict(X_test).values)
        assert_equal(accu1, accu2,
                     "accuracy mismatch after unpickling pipeline")
예제 #7
0
    def test_pickle_pipeline_and_nimbusml_pipeline(self):
        """A pipeline nested as the single 'nimbusml' step of an outer
        Pipeline survives a pickle round trip.

        Both the accuracy and the full prediction frame must be unchanged
        after unpickling.
        """
        load = dict(sep=',', features=selected_features)
        (X_train, y_train) = get_X_y(train_file, label_column, **load)
        (X_test, y_test) = get_X_y(test_file, label_column, **load)
        if 'F1' in X_train.columns:
            raise Exception("F1 is in the dataset")

        inner = nimbusmlPipeline([OneHotVectorizer() << 'age',
                                  FastTreesBinaryClassifier()])
        outer = Pipeline(steps=[('nimbusml', inner)])
        outer.fit(X_train, y_train)

        scores = outer.predict(X_test)
        accu1 = np.mean(
            y_test.values.ravel() == scores["PredictedLabel"].values)

        # Round-trip through pickle and score with the restored copy.
        restored = pickle.loads(pickle.dumps(outer))
        scores2 = restored.predict(X_test)
        accu2 = np.mean(
            y_test.values.ravel() == scores2["PredictedLabel"].values)

        assert_equal(accu1, accu2,
                     "accuracy mismatch after unpickling pipeline")
        assert_frame_equal(scores, scores2)
예제 #8
0
    def test_pipeline_clone(self):
        """Predictions of a cloned pipeline, and of the original after a
        second fit, equal the first predictions.
        """
        load = dict(sep=',', features=selected_features)
        (X_train, y_train) = get_X_y(train_file, label_column, **load)
        (X_test, _) = get_X_y(test_file, label_column, **load)
        if 'F1' in X_train.columns:
            raise Exception("F1 is in the dataset")

        inner = nimbusmlPipeline([OneHotVectorizer() << 'age',
                                  FastTreesBinaryClassifier()])
        skpipe = Pipeline(steps=[('nimbusml', inner)])
        skpipe.fit(X_train, y_train)
        baseline = skpipe.predict(X_test)

        duplicate = clone(skpipe)
        assert_frame_equal(baseline, duplicate.predict(X_test))

        # checks we can fit again
        skpipe.fit(X_train, y_train)
        assert_frame_equal(baseline, skpipe.predict(X_test))
예제 #9
0
    def test_clone_sweep(self):
        """Grid searching a pipeline and its clone selects the same
        number_of_trees.
        """
        np.random.seed(0)
        load = dict(sep=',', encoding='utf-8')
        (X_train, y_train) = get_X_y(train_file, label_column, **load)
        # The test split is loaded like in the original but not used below.
        (_, _) = get_X_y(test_file, label_column, **load)

        hashed = OneHotHashVectorizer() << categorical_columns
        trees = FastTreesBinaryClassifier(number_of_trees=100,
                                          number_of_leaves=5)
        pipe = Pipeline(steps=[('cat', hashed), ('learner', trees)])
        search_space = {'learner__number_of_trees': [1, 5, 10]}

        first = GridSearchCV(pipe, search_space)
        first.fit(X_train, y_train)

        second = GridSearchCV(pipe.clone(), search_space)
        second.fit(X_train, y_train)

        key = 'learner__number_of_trees'
        assert first.best_params_[key] == second.best_params_[key]
예제 #10
0
    def test_pipeline_grid_search(self):
        """Grid search over a pipeline that includes a PCA step.

        With a single candidate per parameter, the best parameters must be
        exactly those candidates, and the last step of the best estimator
        must carry the searched number_of_trees.
        """
        load = dict(sep=',', features=selected_features)
        (X_train, y_train) = get_X_y(train_file, label_column, **load)
        (_, _) = get_X_y(test_file, label_column, **load)
        if 'F1' in X_train.columns:
            raise Exception("F1 is in the dataset")

        steps = [
            ("cat", OneHotVectorizer() << 'age'),
            ('pca', PCA(5)),
            ("ftree", FastTreesBinaryClassifier(number_of_trees=5)),
        ]
        search_space = {
            'pca__n_components': [2],
            'ftree__number_of_trees': [11],
        }
        search = GridSearchCV(Pipeline(steps=steps), search_space)
        search.fit(X_train, y_train)

        assert search.best_params_ == {
            'ftree__number_of_trees': 11,
            'pca__n_components': 2}

        # The final step of the refit pipeline is the tree learner.
        final_learner = search.best_estimator_.steps[-1][1]
        assert final_learner.number_of_trees == 11
예제 #11
0
 def test_trees(self):
     """Train one-hot + FastTrees on the train file and check that the
     predictions on the test file reach the 0.65 accuracy threshold.
     """
     (train_X, train_y) = get_X_y(train_file, label_column, sep=',')
     (test_X, _) = get_X_y(test_file, label_column, sep=',')
     steps = [
         OneHotVectorizer() << categorical_columns,
         FastTreesBinaryClassifier(),
     ]
     model = Pipeline(steps)
     model.fit(train_X, train_y)
     predictions = model.predict(test_X)
     check_accuracy(test_file, label_column, predictions, 0.65)
예제 #12
0
 def test_trees_file(self):
     """Same accuracy check as the in-memory variant, but training and
     prediction read from FileDataStream objects built with file_schema.
     """
     one_hot = OneHotVectorizer() << categorical_columns
     trees = FastTreesBinaryClassifier() << {'Label': label_column}
     model = Pipeline([one_hot, trees])

     model.fit(FileDataStream(train_file, schema=file_schema), label_column)
     predictions = model.predict(
         FileDataStream(test_file, schema=file_schema))
     check_accuracy(test_file, label_column, predictions, 0.65)
예제 #13
0
    def test_parallel(self):
        """Fitting with parallel=8 and then parallel=1 returns results that
        compare equal.
        """
        (train, label) = get_X_y(train_file, label_column, sep=',')
        steps = [
            OneHotVectorizer() << categorical_columns,
            FastTreesBinaryClassifier(),
        ]
        pipeline = Pipeline(steps)

        # Same pipeline object, fitted first with 8 workers and then with 1.
        outcomes = [pipeline.fit(train, label, parallel=workers)
                    for workers in (8, 1)]
        assert_true(outcomes[0] == outcomes[1])
예제 #14
0
 def test_trees(self):
     """Named-step pipeline (one-hot + FastTrees) must reach the 0.77
     threshold used by check_accuracy_scikit on the test file.
     """
     np.random.seed(0)
     (train_X, train_y) = get_X_y(train_file, label_column, sep=',')
     (test_X, _) = get_X_y(test_file, label_column, sep=',')

     steps = [
         ('cat', OneHotVectorizer() << categorical_columns),
         ('linear', FastTreesBinaryClassifier()),
     ]
     model = Pipeline(steps=steps)
     model.fit(train_X, train_y)
     predictions = model.predict(test_X)
     check_accuracy_scikit(test_file, label_column, predictions, 0.77)
예제 #15
0
    def test_learners_sweep(self):
        """Grid search over two alternative learners for the 'learner' step;
        the averaged perceptron is expected to be selected.
        """
        np.random.seed(0)
        frame = pd.DataFrame({
            'education': ['A', 'A', 'A', 'A', 'B', 'A', 'B'],
            'workclass': ['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
            'y': [1, 0, 1, 1, 0, 1, 0],
        })
        features = frame.drop('y', axis=1)
        target = frame['y']

        steps = [
            ('cat', OneHotVectorizer() << ['education', 'workclass']),
            ('learner', FastTreesBinaryClassifier()),
        ]
        search_space = {
            'learner': [
                AveragedPerceptronBinaryClassifier(),
                FastTreesBinaryClassifier(),
            ],
        }
        search = GridSearchCV(Pipeline(steps=steps), search_space)
        search.fit(features, target)

        chosen = search.best_params_['learner']
        assert chosen.__class__.__name__ == \
            'AveragedPerceptronBinaryClassifier'
예제 #16
0
    def test_error_conditions(self):
        """Grid searching a parameter the learner does not have
        ('wrong_arg') must make GridSearchCV.fit raise ValueError.
        """
        np.random.seed(0)
        load = dict(sep=',', encoding='utf-8')
        (X_train, y_train) = get_X_y(train_file, label_column, **load)
        (_, _) = get_X_y(test_file, label_column, **load)

        steps = [
            ('cat', OneHotHashVectorizer() << categorical_columns),
            ('learner', FastTreesBinaryClassifier(num_trees=100,
                                                  num_leaves=5)),
        ]
        bad_space = {'learner__wrong_arg': [1, 5, 10]}
        search = GridSearchCV(Pipeline(steps=steps), bad_space)

        assert_raises(ValueError, search.fit, X_train, y_train)
예제 #17
0
    def test_failing_decision_function_called_with_use_probabilites_true(self):
        """Run the decision_function check on every listed binary learner
        wrapped in OneVsRestClassifier(use_probabilities=True).
        """
        base_learners = [
            # TODO: BUG 231482 , why doesnt FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(min_split=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(min_split=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for base in base_learners:
            wrapped = OneVsRestClassifier(classifier=base,
                                          use_probabilities=True)
            check_decision_function_when_trained_with_use_probabilites_true(
                self, wrapped, base)
예제 #18
0
    def test_failing_predict_proba_called_with_use_probabilites_false(self):
        """Run the predict_proba check on every listed binary learner
        wrapped in OneVsRestClassifier(use_probabilities=False).
        """
        base_learners = [
            # TODO: BUG 231482 , why doesnt FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for base in base_learners:
            wrapped = OneVsRestClassifier(classifier=base,
                                          use_probabilities=False)
            check_predict_proba_when_trained_with_use_probabilites_false(
                self, wrapped, base)
예제 #19
0
    def test_uciadult_sweep(self):
        """Grid search over number_of_trees, then confirm the winner by
        retraining the pipeline with each candidate value.

        The search must pick 10 trees, and the AUC measured on the training
        data with 10 trees must beat the AUC with 5 and with 1 tree and be
        above 0.59.
        """
        np.random.seed(0)
        load = dict(sep=',', encoding='utf-8')
        (X_train, y_train) = get_X_y(train_file, label_column, **load)
        (_, _) = get_X_y(test_file, label_column, **load)

        hashed = OneHotHashVectorizer() << categorical_columns
        # number_of_trees 100 will actually be never run by grid search
        # as its not in param_grid below
        trees = FastTreesBinaryClassifier(number_of_trees=100,
                                          number_of_leaves=5)
        pipe = Pipeline(steps=[('cat', hashed), ('learner', trees)])

        candidates = [1, 5, 10]
        search = GridSearchCV(pipe, dict(learner__number_of_trees=candidates))
        search.fit(X_train, y_train)
        assert search.best_params_['learner__number_of_trees'] == 10

        # compare AUC on number_of_trees 1, 5, 10
        auc = {}
        for tree_count in candidates:
            pipe.set_params(learner__number_of_trees=tree_count)
            pipe.fit(X_train, y_train)
            metrics, _ = pipe.test(X_train, y_train)
            auc[tree_count] = metrics['AUC'][0]

        assert auc[10] > auc[5]
        assert auc[10] > auc[1]
        assert auc[10] > 0.59
예제 #20
0
    def test_decision_function_produces_distribution_not_sum_to_1(self):
        """With use_probabilities=False, the value returned by
        decfun_average must differ from 1.0 for every listed learner.
        """
        base_learners = [
            # TODO: BUG 231482 , why doesnt FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for base in base_learners:
            wrapped = OneVsRestClassifier(classifier=base,
                                          use_probabilities=False)
            scoremean = decfun_average(wrapped)
            failure = \
                '{} raw scores should not sum to 1.0 over 3 classes'.format(
                    base.__class__)
            assert_not_equal(scoremean, 1.0, failure)
예제 #21
0
    def test_pipeline_get_params(self):
        """get_params(deep=True) on the outer pipeline exposes its steps and
        the parameters of the nested 'nimbusml' step.
        """
        load = dict(sep=',', features=selected_features)
        (X_train, y_train) = get_X_y(train_file, label_column, **load)
        (_, _) = get_X_y(test_file, label_column, **load)
        if 'F1' in X_train.columns:
            raise Exception("F1 is in the dataset")

        inner = nimbusmlPipeline([OneHotVectorizer() << 'age',
                                  FastTreesBinaryClassifier()])
        skpipe = Pipeline(steps=[('nimbusml', inner)])
        skpipe.fit(X_train, y_train)

        pars = skpipe.get_params(deep=True)
        assert 'steps' in pars
        # each step entry is expected to have two elements
        first_step = pars['steps'][0]
        assert len(first_step) == 2
        for key in ('nimbusml',
                    'nimbusml__random_state',
                    'nimbusml__steps'):
            assert key in pars
예제 #22
0
    def test_pickle_predictor(self):
        """A single fitted learner keeps its accuracy after a pickle round
        trip.
        """
        np.random.seed(0)
        load = dict(sep=',', features=selected_features)
        (X_train, y_train) = get_X_y(train_file, label_column, **load)
        (X_test, y_test) = get_X_y(test_file, label_column, **load)

        fitted = FastTreesBinaryClassifier().fit(X_train, y_train)
        accu1 = np.mean(
            y_test.values.ravel() == fitted.predict(X_test).values)

        # Serialize and restore, then score with the restored learner.
        restored = pickle.loads(pickle.dumps(fitted))
        accu2 = np.mean(
            y_test.values.ravel() == restored.predict(X_test).values)

        assert_equal(accu1, accu2,
                     "accuracy mismatch after unpickling predictor")
예제 #23
0
    def test_predict_proba_produces_distribution_sum_to_1(self):
        """For every listed learner wrapped in OneVsRestClassifier, the
        value returned by proba_average must equal 1.0.
        """
        base_learners = [
            # TODO: BUG 231482 , why doesnt FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # TODO: why symsgd does not sum to 1.0
            # SymSgdBinaryClassifier(),
        ]

        for base in base_learners:
            wrapped = OneVsRestClassifier(classifier=base)
            probmean = proba_average(wrapped)
            failure = \
                '{} probabilites {} do not sum to 1.0 over 3 classes'.format(
                    base.__class__, probmean)
            assert_equal(probmean, 1.0, failure)
예제 #24
0
# Example: train and evaluate FastTreesBinaryClassifier on the 'infert'
# dataset, predicting the 'case' column from 'age' and one-hot encoded
# 'education'.

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#   age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline: the vectorizer writes its output for
# 'education' to a new column 'edu', which the classifier then uses as a
# feature together with 'age'
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastTreesBinaryClassifier(feature=['age', 'edu'], label='case')
])

# train, predict, and evaluate (on the same data the model was trained on)
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel  Probability     Score
# 0               0     0.330738 -1.762120
# 1               0     0.337897 -1.681700
# 2               0     0.334428 -1.720559
# 3               0     0.331255 -1.756292
# 4               0     0.333299 -1.733252

# print evaluation metrics
예제 #25
0
# Example: multiclass prediction of 'induced' on the 'infert' dataset using
# a binary learner wrapped in OneVsRestClassifier.

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    OneVsRestClassifier(
        # using a binary classifier + OVR for multiclass dataset
        FastTreesBinaryClassifier(),
        # True = class probabilities will sum to 1.0
        # False = raw scores, unknown range
        use_probabilities=True,
        feature=['age', 'edu'],
        label='induced')
])

# train, predict, and evaluate (on the same data the model was trained on)
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions: one score column per class
print(predictions.head())
#   PredictedLabel   Score.0   Score.1   Score.2
# 0               2  0.084504  0.302600  0.612897
예제 #26
0
    def test_pipeline_exports_complex(self):

        name = "test_pipeline_exports_complex.csv"
        with open(name, "w") as f:
            f.write(_sentiments)

        transform_1 = NGramFeaturizer() << {'transformed1': 'SentimentText'}
        transform_2 = OneHotVectorizer() << 'SentimentSource'
        transform_3 = ColumnConcatenator() << {
            'finalfeatures': ['transformed1', 'SentimentSource']
        }
        algo = FastTreesBinaryClassifier() << {
            Role.Feature: 'finalfeatures',
            Role.Label: "Positive"
        }

        exp = Pipeline([transform_1, transform_2, transform_3, algo])

        stream = FileDataStream.read_csv(name, sep="\t")
        res = dot_export_pipeline(exp, stream).strip("\n\r ")
        exp = """
                digraph{
                  orientation=portrait;
                  sch0[label="<f0> ItemID|<f1> Sentiment|<f2> \
SentimentSource|<f3> SentimentText|<f4> RowNum|<f5> \
Positive|<f6> Train|<f7> Small",shape=record,fontsize=8];

                  node1[label="NGramFeaturizer",shape=box,style="filled,\
rounded",color=cyan,fontsize=12];
                  sch0:f3 -> node1;
                  sch1[label="<f0> transformed1|<f1> \
transformed1_TransformedText",shape=record,fontsize=8];
                  node1 -> sch1:f0;
                  node1 -> sch1:f1;

                  node2[label="OneHotVectorizer",shape=box,\
style="filled,rounded",color=cyan,fontsize=12];
                  sch0:f2 -> node2;
                  sch2[label="<f0> SentimentSource",shape=record,\
fontsize=8];
                  node2 -> sch2:f0;

                  node3[label="ColumnConcatenator",shape=box,\
style="filled,rounded",color=cyan,fontsize=12];
                  sch1:f0 -> node3;
                  sch2:f0 -> node3;
                  sch3[label="<f0> finalfeatures",shape=record,fontsize=8];
                  node3 -> sch3:f0;

                  node4[label="FastTreesBinaryClassifier",shape=box,\
style="filled,rounded",color=yellow,fontsize=12];
                  sch3:f0 -> node4 [label="Feature",fontsize=8];
                  sch0:f5 -> node4 [label="Label",fontsize=8];
                  sch4[label="<f0> PredictedLabel|<f1> \
PredictedProba|<f2> Score",shape=record,fontsize=8];
                  node4 -> sch4:f0;
                  node4 -> sch4:f1;
                  node4 -> sch4:f2;
                }
                """.replace("                ", "").strip("\n\r ")
        assert res == exp
예제 #27
0
import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesBinaryClassifier, GamBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer
from nimbusml.linear_model import FastLinearBinaryClassifier, \
    LogisticRegressionBinaryClassifier
from sklearn.model_selection import GridSearchCV

# Example: grid search over both a transform hyperparameter and the learner
# used in the 'learner' step of the pipeline.

# toy dataset: two categorical features and a binary label
df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                       workclass=['X', 'X', 'Y', 'Y', 'Y'],
                       y=[1, 0, 1, 0, 0]))
X = df.drop('y', axis=1)
y = df['y']

cat = OneHotHashVectorizer() << ['education', 'workclass']
# placeholder learner; the grid below substitutes each candidate in turn
learner = FastTreesBinaryClassifier()
pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

param_grid = dict(cat__hash_bits=[1, 2, 4, 6, 8, 16],
                  learner=[
                      FastLinearBinaryClassifier(),
                      FastTreesBinaryClassifier(),
                      LogisticRegressionBinaryClassifier(),
                      GamBinaryClassifier()
                  ])
# `iid` is deliberately not passed: scikit-learn deprecated the parameter in
# 0.22 and removed it in 0.24, where passing it raises TypeError.
grid = GridSearchCV(pipe, param_grid, cv=3)

grid.fit(X, y)
print(grid.best_params_['learner'].__class__.__name__)
# FastLinearBinaryClassifier
print(grid.best_params_['cat__hash_bits'])
예제 #28
0
    SgdBinaryClassifier(),
    # Error on linux
    # Unable to load shared library 'SymSgdNative' or one of its dependencies
    #SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor(),
    OneVsRestClassifier(FastLinearBinaryClassifier()),
    GamRegressor(),
    GamBinaryClassifier(),
    PcaAnomalyDetector(),
    FactorizationMachineBinaryClassifier(),
    KMeansPlusPlus(n_clusters=2),
    NaiveBayesClassifier(),
    FastForestBinaryClassifier(number_of_trees=2), 
    FastForestRegressor(number_of_trees=2),
    FastTreesBinaryClassifier(number_of_trees=2),
    FastTreesRegressor(number_of_trees=2),
    FastTreesTweedieRegressor(number_of_trees=2),
    LightGbmRegressor(number_of_iterations=2),
    LightGbmClassifier(),
    LightGbmBinaryClassifier(number_of_iterations=2)
]

# Learners kept out of the list that ends just above; currently empty because
# the only candidate is commented out.
# NOTE(review): presumably consumed by a test further down -- confirm.
learners_not_supported = [
    #PcaTransformer(), # REVIEW: crashes
]


class TestModelSummary(unittest.TestCase):

    def test_model_summary(self):
예제 #29
0
# GridSearchCV with Pipeline: hyperparameter grid search.
import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer, \
    OneHotVectorizer
from sklearn.model_selection import GridSearchCV

# toy dataset: two categorical features and a binary label
df = pd.DataFrame(
    dict(education=['A', 'B', 'A', 'B', 'A'],
         workclass=['X', 'X', 'Y', 'Y', 'Y'],
         y=[1, 0, 1, 0, 0]))
X = df.drop('y', axis=1)
y = df['y']
pipe = Pipeline([
    ('cat', OneHotVectorizer() << 'education'),
    # unnamed step, stays same in grid search
    OneHotHashVectorizer() << 'workclass',
    # this instance of FastTreesBinaryClassifier with num_trees 0 will be
    # never run by grid search as its not a part of param_grid below
    ('learner', FastTreesBinaryClassifier(num_trees=0, num_leaves=2))
])

# only named steps can be addressed with the <step>__<parameter> syntax
param_grid = dict(cat__output_kind=['Ind', 'Bin'],
                  learner__num_trees=[1, 2, 3])
# `iid` is deliberately not passed: scikit-learn deprecated the parameter in
# 0.22 and removed it in 0.24, where passing it raises TypeError.
grid = GridSearchCV(pipe, param_grid, cv=3)

grid.fit(X, y)
print(grid.best_params_)
# {'cat__output_kind': 'Ind', 'learner__num_trees': 1}
예제 #30
0
# the shape function for the given feature evaluated at the feature value.
# Ask the already-trained linear model (`lr_model`, defined earlier in the
# script) for the feature contributions on `data`.
lr_feature_contributions = lr_model.get_feature_contributions(data)

# Print predictions with feature contributions, which give a relative measure
# of how much each feature impacted the Score.
print("========== Feature Contributions for Linear Model ==========")
print(lr_feature_contributions.head())
#   label  ... PredictedLabel     Score ... FeatureContributions.hours-per-week
# 0     0  ...              0 -2.010687 ...                            0.833069
# 1     0  ...              0 -1.216163 ...                            0.809928
# 2     1  ...              0 -1.248412 ...                            0.485957
# 3     1  ...              0 -1.132419 ...                            0.583148
# 4     0  ...              0 -1.969522 ...                            0.437361

# define the training pipeline with a tree model, using three feature
# columns and 'label' as the label column
tree_pipeline = Pipeline([FastTreesBinaryClassifier(
    feature=['age', 'education-num', 'hours-per-week'], label='label')])

# train the model
tree_model = tree_pipeline.fit(data)

# For tree-based models, the calculation of feature contribution essentially
# consists in determining which splits in the tree have the most impact on the
# final score and assigning the value of the impact to the features determining
# the split. More precisely, the contribution of a feature is equal to the
# change in score produced by exploring the opposite sub-tree every time a
# decision node for the given feature is encountered.
# 
# Consider a simple case with a single decision tree that has a decision node
# for the binary feature F1. Given an example that has feature F1 equal to
# true, we can calculate the score it would have obtained if we chose the
# subtree corresponding to the feature F1 being equal to false while keeping