def test_learners_sweep(self):
    # Sweep the 'learner' step itself together with its thread count.
    # The pipeline is declared with FastTreesBinaryClassifier, yet the
    # assertions expect FastLinearBinaryClassifier to be reported as best,
    # which shows that the grid really replaced the learner step.
    np.random.seed(0)
    frame = pd.DataFrame({
        'education': ['A', 'A', 'A', 'A', 'B', 'A', 'B'],
        'workclass': ['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
        'y': [1, 0, 1, 1, 0, 1, 0],
    })
    features = frame.drop('y', axis=1)
    target = frame['y']

    encoder = OneHotVectorizer() << ['education', 'workclass']
    default_learner = FastTreesBinaryClassifier()
    pipe = Pipeline(steps=[('cat', encoder), ('learner', default_learner)])

    search_space = {
        'learner': [
            FastLinearBinaryClassifier(),
            FastTreesBinaryClassifier(),
        ],
        'learner__number_of_threads': [1, 4],
    }
    search = GridSearchCV(pipe, search_space)
    search.fit(features, target)

    best = search.best_params_
    assert best['learner'].__class__.__name__ == 'FastLinearBinaryClassifier'
    assert best['learner__number_of_threads'] == 1
def test_default_label(self):
    # Train the same two-step pipeline four times, varying only which
    # roles are bound on the learner, and require identical probabilities:
    #   1. Label and Feature roles given explicitly (the reference run)
    #   2. Feature role only
    #   3. no roles at all
    #   4. Label role only
    data = get_dataset("iris").as_df()
    data.drop(['Species'], inplace=True, axis=1)
    data.Label = [int(value == 1) for value in data.Label]

    def fit_and_score(roles):
        # Build a fresh pipeline; `roles` of None leaves the learner unbound.
        concat = ColumnConcatenator() << {
            'Features': ["Petal_Length", "Sepal_Length"]
        }
        learner = FastTreesBinaryClassifier(number_of_trees=2)
        if roles is not None:
            learner = learner << roles
        fitted = Pipeline([concat, learner]).fit(data, verbose=0)
        return fitted.predict_proba(data)

    reference = fit_and_score({Role.Label: 'Label', Role.Feature: 'Features'})

    other_bindings = (
        {Role.Feature: 'Features'},
        None,
        {Role.Label: 'Label'},
    )
    for roles in other_bindings:
        assert_array_almost_equal(reference, fit_and_score(roles))
def test_hyperparameters_sweep(self):
    # Grid search over a pipeline that mixes named steps with an unnamed
    # one; only the named steps are addressed by the parameter grid.
    np.random.seed(0)
    frame = pd.DataFrame({
        'education': ['A', 'A', 'A', 'A', 'B', 'A', 'B'],
        'workclass': ['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
        'y': [1, 0, 1, 1, 0, 1, 0],
    })
    features = frame.drop('y', axis=1)
    target = frame['y']

    steps = [
        ('cat', OneHotVectorizer() << 'education'),
        # unnamed step: identical in every grid candidate
        OneHotHashVectorizer() << 'workclass',
        # number_of_trees=0 is never trained; the grid always overrides it
        ('learner',
         FastTreesBinaryClassifier(number_of_trees=0, number_of_leaves=2)),
    ]
    pipe = Pipeline(steps)

    search_space = {
        'cat__output_kind': ['Indicator', 'Binary'],
        'learner__number_of_trees': [1, 2, 3],
    }
    search = GridSearchCV(pipe, search_space)
    search.fit(features, target)
    print(search.best_params_)

    expected = {
        'cat__output_kind': 'Indicator',
        'learner__number_of_trees': 1,
    }
    assert search.best_params_ == expected
def test_experiment_loadsavemodel(self):
    # Train a pipeline, save it to a temporary file, load that file into a
    # brand-new Pipeline and check that both produce the same metrics on
    # the test set. The temporary model file is removed afterwards.
    (train, label) = get_X_y(train_file, label_column, sep=',')
    (test, label1) = get_X_y(test_file, label_column, sep=',')
    cat = OneHotVectorizer() << categorical_columns
    ftree = FastTreesBinaryClassifier()
    pipeline = Pipeline([cat, ftree])
    pipeline.fit(train, label)
    metrics1, _ = pipeline.test(test, label1, 'binary', output_scores=True)
    sum1 = metrics1.sum().sum()

    # mkstemp hands back an open descriptor; only the path is needed, so
    # close the descriptor immediately instead of wrapping it in a file.
    (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
    os.close(fd)
    try:
        pipeline.save_model(modelfilename)

        pipeline2 = Pipeline()
        pipeline2.load_model(modelfilename)
        # Scoring stays inside the try block so the model file still exists
        # while the reloaded pipeline is being evaluated.
        metrics2, _ = pipeline2.test(test, label1, 'binary',
                                     output_scores=True)
        sum2 = metrics2.sum().sum()
    finally:
        # Best-effort cleanup: the original test left the model file behind.
        # A failure to delete must not mask the real test outcome.
        try:
            os.remove(modelfilename)
        except OSError:
            pass

    assert_equal(sum1, sum2, "model metrics don't match after loading model")
def test_ovr_accuracy(self):
    # Wrap each binary learner in OneVsRestClassifier (probabilities on)
    # and require the micro-averaged accuracy to clear a deliberately low
    # bar, since the learners differ widely in accuracy. This also
    # exercises the Pipeline + OVA + learner combination.
    binary_learners = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(number_of_threads=1),
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1,
                                   number_of_threads=1),
        GamBinaryClassifier(number_of_threads=1),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1,
                                  number_of_threads=1),
        FastLinearBinaryClassifier(number_of_threads=1),
        SgdBinaryClassifier(number_of_threads=1),
        # SymSgdBinaryClassifier(number_of_threads=1),
    ]

    for learner in binary_learners:
        ovr = OneVsRestClassifier(classifier=learner, use_probabilities=True)
        micro_accuracy = accuracy(ovr)['Accuracy(micro-avg)'][0]
        failure_text = "{} accuracy is too low {}".format(
            learner.__class__, micro_accuracy)
        assert_greater(micro_accuracy, 0.65, failure_text)
def test_pickle_pipeline(self):
    # Fit a named-step pipeline, pickle it, delete the model files of its
    # steps, then unpickle and check the accuracy is unchanged.
    np.random.seed(0)
    X_train, y_train = get_X_y(train_file, label_column, sep=',',
                               features=selected_features)
    X_test, y_test = get_X_y(test_file, label_column, sep=',',
                             features=selected_features)
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    encoder = OneHotVectorizer() << 'age'
    trees = FastTreesBinaryClassifier()
    pipe = Pipeline(steps=[("cat", encoder), ("ftree", trees)])
    pipe.fit(X_train, y_train)
    predicted = pipe.predict(X_test)
    accuracy_before = np.mean(y_test.values.ravel() == predicted.values)

    # Serialize first, then remove the per-step model files so that the
    # restored pipeline cannot depend on them.
    payload = pickle.dumps(pipe)
    for step in (encoder, trees):
        os.remove(step.model_)

    restored = pickle.loads(payload)
    predicted_again = restored.predict(X_test)
    accuracy_after = np.mean(
        y_test.values.ravel() == predicted_again.values)

    assert_equal(accuracy_before, accuracy_after,
                 "accuracy mismatch after unpickling pipeline")
def test_pickle_pipeline_and_nimbusml_pipeline(self):
    # Nest a nimbusml pipeline as the single step of an outer Pipeline,
    # round-trip the outer pipeline through pickle and check that both the
    # accuracy and the full prediction frame are unchanged.
    X_train, y_train = get_X_y(train_file, label_column, sep=',',
                               features=selected_features)
    X_test, y_test = get_X_y(test_file, label_column, sep=',',
                             features=selected_features)
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    encoder = OneHotVectorizer() << 'age'
    trees = FastTreesBinaryClassifier()
    inner = nimbusmlPipeline([encoder, trees])
    outer = Pipeline(steps=[('nimbusml', inner)])
    outer.fit(X_train, y_train)

    predicted = outer.predict(X_test)
    accuracy_before = np.mean(
        y_test.values.ravel() == predicted["PredictedLabel"].values)

    # Round trip through pickle; the restored pipeline must score the same.
    restored = pickle.loads(pickle.dumps(outer))
    predicted_again = restored.predict(X_test)
    accuracy_after = np.mean(
        y_test.values.ravel() == predicted_again["PredictedLabel"].values)

    assert_equal(accuracy_before, accuracy_after,
                 "accuracy mismatch after unpickling pipeline")
    assert_frame_equal(predicted, predicted_again)
def test_pipeline_clone(self):
    # Predictions of a cloned pipeline, and of the original after a second
    # fit, must equal the predictions of the first fit.
    X_train, y_train = get_X_y(train_file, label_column, sep=',',
                               features=selected_features)
    X_test, y_test = get_X_y(test_file, label_column, sep=',',
                             features=selected_features)
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    encoder = OneHotVectorizer() << 'age'
    trees = FastTreesBinaryClassifier()
    inner = nimbusmlPipeline([encoder, trees])
    outer = Pipeline(steps=[('nimbusml', inner)])
    outer.fit(X_train, y_train)
    baseline = outer.predict(X_test)

    duplicate = clone(outer)
    assert_frame_equal(baseline, duplicate.predict(X_test))

    # checks we can fit again
    outer.fit(X_train, y_train)
    assert_frame_equal(baseline, outer.predict(X_test))
def test_clone_sweep(self):
    # Run a grid search, clone the pipeline, run the same grid search on
    # the clone and expect both to select the same number of trees.
    np.random.seed(0)
    X_train, y_train = get_X_y(train_file, label_column, sep=',',
                               encoding='utf-8')
    X_test, y_test = get_X_y(test_file, label_column, sep=',',
                             encoding='utf-8')

    hasher = OneHotHashVectorizer() << categorical_columns
    trees = FastTreesBinaryClassifier(number_of_trees=100,
                                      number_of_leaves=5)
    pipe = Pipeline(steps=[('cat', hasher), ('learner', trees)])
    search_space = {'learner__number_of_trees': [1, 5, 10]}

    first_search = GridSearchCV(pipe, search_space)
    first_search.fit(X_train, y_train)

    second_search = GridSearchCV(pipe.clone(), search_space)
    second_search.fit(X_train, y_train)

    key = 'learner__number_of_trees'
    assert first_search.best_params_[key] == second_search.best_params_[key]
def test_pipeline_grid_search(self):
    # Grid search over a pipeline containing a PCA step between the
    # encoder and the tree learner; with a single candidate per parameter
    # the best estimator must carry exactly those values.
    X_train, y_train = get_X_y(train_file, label_column, sep=',',
                               features=selected_features)
    X_test, y_test = get_X_y(test_file, label_column, sep=',',
                             features=selected_features)
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    encoder = OneHotVectorizer() << 'age'
    trees = FastTreesBinaryClassifier(number_of_trees=5)
    pipe = Pipeline(steps=[
        ("cat", encoder),
        ('pca', PCA(5)),
        ("ftree", trees),
    ])

    search_space = dict(pca__n_components=[2], ftree__number_of_trees=[11])
    search = GridSearchCV(pipe, search_space)
    search.fit(X_train, y_train)

    assert search.best_params_ == {
        'ftree__number_of_trees': 11,
        'pca__n_components': 2,
    }

    # The last step of the refitted best estimator is the tree learner.
    final_estimator = search.best_estimator_.steps[-1][1]
    assert final_estimator.number_of_trees == 11
def test_trees(self):
    # One-hot encode the categorical columns, train FastTrees and check
    # the predictions on the test file against a 0.65 accuracy threshold.
    train, label = get_X_y(train_file, label_column, sep=',')
    test, label1 = get_X_y(test_file, label_column, sep=',')

    encoder = OneHotVectorizer() << categorical_columns
    pipeline = Pipeline([encoder, FastTreesBinaryClassifier()])
    pipeline.fit(train, label)

    predictions = pipeline.predict(test)
    check_accuracy(test_file, label_column, predictions, 0.65)
def test_trees_file(self):
    # Same scenario as training from data frames, but both training and
    # prediction read their input through FileDataStream.
    encoder = OneHotVectorizer() << categorical_columns
    learner = FastTreesBinaryClassifier() << {'Label': label_column}
    pipeline = Pipeline([encoder, learner])

    train_stream = FileDataStream(train_file, schema=file_schema)
    pipeline.fit(train_stream, label_column)

    test_stream = FileDataStream(test_file, schema=file_schema)
    predictions = pipeline.predict(test_stream)
    check_accuracy(test_file, label_column, predictions, 0.65)
def test_parallel(self):
    # Fit the same pipeline with parallel=8 and then with parallel=1 and
    # assert that the two fit() results compare equal.
    train, label = get_X_y(train_file, label_column, sep=',')

    encoder = OneHotVectorizer() << categorical_columns
    trees = FastTreesBinaryClassifier()
    pipeline = Pipeline([encoder, trees])

    fitted_parallel = pipeline.fit(train, label, parallel=8)
    fitted_serial = pipeline.fit(train, label, parallel=1)
    assert_true(fitted_parallel == fitted_serial)
def test_trees(self):
    # Named-step pipeline: one-hot encoding followed by FastTrees, checked
    # against a 0.77 accuracy threshold via check_accuracy_scikit.
    np.random.seed(0)
    train, label = get_X_y(train_file, label_column, sep=',')
    test, label1 = get_X_y(test_file, label_column, sep=',')

    steps = [
        ('cat', OneHotVectorizer() << categorical_columns),
        ('linear', FastTreesBinaryClassifier()),
    ]
    pipe = Pipeline(steps=steps)
    pipe.fit(train, label)

    predictions = pipe.predict(test)
    check_accuracy_scikit(test_file, label_column, predictions, 0.77)
def test_learners_sweep(self):
    # Grid search whose only dimension is the learner step itself; the
    # assertion expects AveragedPerceptronBinaryClassifier to be selected.
    np.random.seed(0)
    frame = pd.DataFrame({
        'education': ['A', 'A', 'A', 'A', 'B', 'A', 'B'],
        'workclass': ['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
        'y': [1, 0, 1, 1, 0, 1, 0],
    })
    features = frame.drop('y', axis=1)
    target = frame['y']

    encoder = OneHotVectorizer() << ['education', 'workclass']
    default_learner = FastTreesBinaryClassifier()
    pipe = Pipeline(steps=[('cat', encoder), ('learner', default_learner)])

    search_space = {
        'learner': [
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(),
        ],
    }
    search = GridSearchCV(pipe, search_space)
    search.fit(features, target)

    winner = search.best_params_['learner']
    assert winner.__class__.__name__ == 'AveragedPerceptronBinaryClassifier'
def test_error_conditions(self):
    # A grid that names a parameter the learner does not have must make
    # GridSearchCV.fit raise ValueError.
    np.random.seed(0)
    X_train, y_train = get_X_y(train_file, label_column, sep=',',
                               encoding='utf-8')
    X_test, y_test = get_X_y(test_file, label_column, sep=',',
                             encoding='utf-8')

    hasher = OneHotHashVectorizer() << categorical_columns
    trees = FastTreesBinaryClassifier(num_trees=100, num_leaves=5)
    pipe = Pipeline(steps=[('cat', hasher), ('learner', trees)])

    bad_space = {'learner__wrong_arg': [1, 5, 10]}
    search = GridSearchCV(pipe, bad_space)
    assert_raises(ValueError, search.fit, X_train, y_train)
def test_failing_decision_function_called_with_use_probabilites_true(self):
    # For each binary learner, build a one-vs-rest classifier with
    # use_probabilities=True and hand it to the shared check helper.
    binary_learners = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        FastForestBinaryClassifier(min_split=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(min_split=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # SymSgdBinaryClassifier(),
    ]

    for learner in binary_learners:
        check_decision_function_when_trained_with_use_probabilites_true(
            self,
            OneVsRestClassifier(classifier=learner, use_probabilities=True),
            learner)
def test_failing_predict_proba_called_with_use_probabilites_false(self):
    # For each binary learner, build a one-vs-rest classifier with
    # use_probabilities=False and hand it to the shared check helper.
    binary_learners = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # SymSgdBinaryClassifier(),
    ]

    for learner in binary_learners:
        check_predict_proba_when_trained_with_use_probabilites_false(
            self,
            OneVsRestClassifier(classifier=learner, use_probabilities=False),
            learner)
def test_uciadult_sweep(self):
    # Grid search over number_of_trees, then confirm the winning value by
    # training the pipeline in full for each candidate and comparing AUC.
    np.random.seed(0)
    X_train, y_train = get_X_y(train_file, label_column, sep=',',
                               encoding='utf-8')
    X_test, y_test = get_X_y(test_file, label_column, sep=',',
                             encoding='utf-8')

    hasher = OneHotHashVectorizer() << categorical_columns
    # number_of_trees=100 is never trained by the grid search because it
    # is not among the candidate values below
    trees = FastTreesBinaryClassifier(number_of_trees=100,
                                      number_of_leaves=5)
    pipe = Pipeline(steps=[('cat', hasher), ('learner', trees)])

    search = GridSearchCV(pipe, {'learner__number_of_trees': [1, 5, 10]})
    search.fit(X_train, y_train)
    assert search.best_params_['learner__number_of_trees'] == 10

    # compare AUC on number_of_trees 1, 5, 10
    auc_by_trees = {}
    for tree_count in (1, 5, 10):
        pipe.set_params(learner__number_of_trees=tree_count)
        pipe.fit(X_train, y_train)
        metrics, _ = pipe.test(X_train, y_train)
        auc_by_trees[tree_count] = metrics['AUC'][0]

    assert auc_by_trees[10] > auc_by_trees[5]
    assert auc_by_trees[10] > auc_by_trees[1]
    assert auc_by_trees[10] > 0.59
def test_decision_function_produces_distribution_not_sum_to_1(self):
    # With use_probabilities=False the value returned by decfun_average
    # for each learner is expected to differ from 1.0.
    binary_learners = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # SymSgdBinaryClassifier(),
    ]

    for learner in binary_learners:
        ovr = OneVsRestClassifier(classifier=learner, use_probabilities=False)
        score_mean = decfun_average(ovr)
        failure_text = (
            '{} raw scores should not sum to 1.0 over 3 classes'.format(
                learner.__class__))
        assert_not_equal(score_mean, 1.0, failure_text)
def test_pipeline_get_params(self):
    # get_params(deep=True) on an outer Pipeline wrapping a nimbusml
    # pipeline must expose the step list and the nested parameters.
    X_train, y_train = get_X_y(train_file, label_column, sep=',',
                               features=selected_features)
    X_test, y_test = get_X_y(test_file, label_column, sep=',',
                             features=selected_features)
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    encoder = OneHotVectorizer() << 'age'
    trees = FastTreesBinaryClassifier()
    inner = nimbusmlPipeline([encoder, trees])
    outer = Pipeline(steps=[('nimbusml', inner)])
    outer.fit(X_train, y_train)

    params = outer.get_params(deep=True)
    assert 'steps' in params
    # each entry of 'steps' has two elements
    first_step = params['steps'][0]
    assert len(first_step) == 2
    assert 'nimbusml' in params
    assert 'nimbusml__random_state' in params
    assert 'nimbusml__steps' in params
def test_pickle_predictor(self):
    # A fitted standalone predictor must give the same accuracy before and
    # after a pickle round trip.
    np.random.seed(0)
    X_train, y_train = get_X_y(train_file, label_column, sep=',',
                               features=selected_features)
    X_test, y_test = get_X_y(test_file, label_column, sep=',',
                             features=selected_features)

    trees = FastTreesBinaryClassifier().fit(X_train, y_train)
    predicted = trees.predict(X_test)
    accuracy_before = np.mean(y_test.values.ravel() == predicted.values)

    # Round trip through pickle; the restored predictor must score the same.
    restored = pickle.loads(pickle.dumps(trees))
    predicted_again = restored.predict(X_test)
    accuracy_after = np.mean(
        y_test.values.ravel() == predicted_again.values)

    assert_equal(accuracy_before, accuracy_after,
                 "accuracy mismatch after unpickling predictor")
def test_predict_proba_produces_distribution_sum_to_1(self):
    # With the default OneVsRestClassifier settings the value returned by
    # proba_average for each learner is expected to equal 1.0.
    binary_learners = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # TODO: why symsgd does not sum to 1.0
        # SymSgdBinaryClassifier(),
    ]

    for learner in binary_learners:
        ovr = OneVsRestClassifier(classifier=learner)
        prob_mean = proba_average(ovr)
        failure_text = (
            '{} probabilites {} do not sum to 1.0 over 3 classes'.format(
                learner.__class__, prob_mean))
        assert_equal(prob_mean, 1.0, failure_text)
# data input (as a FileDataStream) path = get_dataset('infert').as_filepath() data = FileDataStream.read_csv(path) print(data.head()) # age case education induced parity ... row_num spontaneous ... # 0 26 1 0-5yrs 1 6 ... 1 2 ... # 1 42 1 0-5yrs 1 1 ... 2 0 ... # 2 39 1 0-5yrs 2 6 ... 3 0 ... # 3 34 1 0-5yrs 2 4 ... 4 0 ... # 4 35 1 6-11yrs 1 3 ... 5 1 ... # define the training pipeline pipeline = Pipeline([ OneHotVectorizer(columns={'edu': 'education'}), FastTreesBinaryClassifier(feature=['age', 'edu'], label='case') ]) # train, predict, and evaluate metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions print(predictions.head()) # PredictedLabel Probability Score # 0 0 0.330738 -1.762120 # 1 0 0.337897 -1.681700 # 2 0 0.334428 -1.720559 # 3 0 0.331255 -1.756292 # 4 0 0.333299 -1.733252 # print evaluation metrics
path = get_dataset('infert').as_filepath() data = FileDataStream.read_csv(path) print(data.head()) # age case education induced parity ... row_num spontaneous ... # 0 26 1 0-5yrs 1 6 ... 1 2 ... # 1 42 1 0-5yrs 1 1 ... 2 0 ... # 2 39 1 0-5yrs 2 6 ... 3 0 ... # 3 34 1 0-5yrs 2 4 ... 4 0 ... # 4 35 1 6-11yrs 1 3 ... 5 1 ... # define the training pipeline pipeline = Pipeline([ OneHotVectorizer(columns={'edu': 'education'}), OneVsRestClassifier( # using a binary classifier + OVR for multiclass dataset FastTreesBinaryClassifier(), # True = class probabilities will sum to 1.0 # False = raw scores, unknown range use_probabilities=True, feature=['age', 'edu'], label='induced') ]) # train, predict, and evaluate # TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions print(predictions.head()) # PredictedLabel Score.0 Score.1 Score.2 # 0 2 0.084504 0.302600 0.612897
# Writes the `_sentiments` fixture to a CSV file in the working directory,
# builds a four-step pipeline (NGramFeaturizer -> OneHotVectorizer ->
# ColumnConcatenator -> FastTreesBinaryClassifier with explicit Feature and
# Label roles), exports it with dot_export_pipeline and compares the result
# with the expected DOT graph text after both sides are stripped.
# The name `exp` is bound first to the pipeline and then rebound to the
# expected graph string.
# NOTE(review): nothing in this test removes the CSV file it creates.
def test_pipeline_exports_complex(self): name = "test_pipeline_exports_complex.csv" with open(name, "w") as f: f.write(_sentiments) transform_1 = NGramFeaturizer() << {'transformed1': 'SentimentText'} transform_2 = OneHotVectorizer() << 'SentimentSource' transform_3 = ColumnConcatenator() << { 'finalfeatures': ['transformed1', 'SentimentSource'] } algo = FastTreesBinaryClassifier() << { Role.Feature: 'finalfeatures', Role.Label: "Positive" } exp = Pipeline([transform_1, transform_2, transform_3, algo]) stream = FileDataStream.read_csv(name, sep="\t") res = dot_export_pipeline(exp, stream).strip("\n\r ") exp = """ digraph{ orientation=portrait; sch0[label="<f0> ItemID|<f1> Sentiment|<f2> \ SentimentSource|<f3> SentimentText|<f4> RowNum|<f5> \ Positive|<f6> Train|<f7> Small",shape=record,fontsize=8]; node1[label="NGramFeaturizer",shape=box,style="filled,\ rounded",color=cyan,fontsize=12]; sch0:f3 -> node1; sch1[label="<f0> transformed1|<f1> \ transformed1_TransformedText",shape=record,fontsize=8]; node1 -> sch1:f0; node1 -> sch1:f1; node2[label="OneHotVectorizer",shape=box,\ style="filled,rounded",color=cyan,fontsize=12]; sch0:f2 -> node2; sch2[label="<f0> SentimentSource",shape=record,\ fontsize=8]; node2 -> sch2:f0; node3[label="ColumnConcatenator",shape=box,\ style="filled,rounded",color=cyan,fontsize=12]; sch1:f0 -> node3; sch2:f0 -> node3; sch3[label="<f0> finalfeatures",shape=record,fontsize=8]; node3 -> sch3:f0; node4[label="FastTreesBinaryClassifier",shape=box,\ style="filled,rounded",color=yellow,fontsize=12]; sch3:f0 -> node4 [label="Feature",fontsize=8]; sch0:f5 -> node4 [label="Label",fontsize=8]; sch4[label="<f0> PredictedLabel|<f1> \ PredictedProba|<f2> Score",shape=record,fontsize=8]; node4 -> sch4:f0; node4 -> sch4:f1; node4 -> sch4:f2; } """.replace(" ", "").strip("\n\r ") assert res == exp
import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesBinaryClassifier, GamBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer
from nimbusml.linear_model import FastLinearBinaryClassifier, \
    LogisticRegressionBinaryClassifier
from sklearn.model_selection import GridSearchCV

# Example: grid search over both a transform hyperparameter (hash_bits of
# the 'cat' step) and the learner step itself.
df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                       workclass=['X', 'X', 'Y', 'Y', 'Y'],
                       y=[1, 0, 1, 0, 0]))
X = df.drop('y', axis=1)
y = df['y']

cat = OneHotHashVectorizer() << ['education', 'workclass']
# The learner given here is only a placeholder; the grid replaces it with
# each of the candidates listed under 'learner'.
learner = FastTreesBinaryClassifier()
pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

param_grid = dict(cat__hash_bits=[1, 2, 4, 6, 8, 16],
                  learner=[
                      FastLinearBinaryClassifier(),
                      FastTreesBinaryClassifier(),
                      LogisticRegressionBinaryClassifier(),
                      GamBinaryClassifier()
                  ])

# `iid` is deliberately not passed: 'warn' was merely its default value in
# older scikit-learn releases, and the parameter no longer exists in newer
# ones, where passing it makes the constructor raise TypeError.
grid = GridSearchCV(pipe, param_grid, cv=3)

grid.fit(X, y)
print(grid.best_params_['learner'].__class__.__name__)
# FastLinearBinaryClassifier
print(grid.best_params_['cat__hash_bits'])
SgdBinaryClassifier(), # Error on linux # Unable to load shared library 'SymSgdNative' or one of its dependencies #SymSgdBinaryClassifier(), OrdinaryLeastSquaresRegressor(), PoissonRegressionRegressor(), OneVsRestClassifier(FastLinearBinaryClassifier()), GamRegressor(), GamBinaryClassifier(), PcaAnomalyDetector(), FactorizationMachineBinaryClassifier(), KMeansPlusPlus(n_clusters=2), NaiveBayesClassifier(), FastForestBinaryClassifier(number_of_trees=2), FastForestRegressor(number_of_trees=2), FastTreesBinaryClassifier(number_of_trees=2), FastTreesRegressor(number_of_trees=2), FastTreesTweedieRegressor(number_of_trees=2), LightGbmRegressor(number_of_iterations=2), LightGbmClassifier(), LightGbmBinaryClassifier(number_of_iterations=2) ] learners_not_supported = [ #PcaTransformer(), # REVIEW: crashes ] class TestModelSummary(unittest.TestCase): def test_model_summary(self):
# GridSearchCV with Pipeline: hyperparameter grid search.
import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer, \
    OneHotVectorizer
from sklearn.model_selection import GridSearchCV

df = pd.DataFrame(
    dict(education=['A', 'B', 'A', 'B', 'A'],
         workclass=['X', 'X', 'Y', 'Y', 'Y'],
         y=[1, 0, 1, 0, 0]))
X = df.drop('y', axis=1)
y = df['y']

pipe = Pipeline([
    ('cat', OneHotVectorizer() << 'education'),
    # unnamed step, stays same in grid search
    OneHotHashVectorizer() << 'workclass',
    # this instance of FastTreesBinaryClassifier with num_trees 0 will be
    # never run by grid search as its not a part of param_grid below
    ('learner', FastTreesBinaryClassifier(num_trees=0, num_leaves=2))
])

param_grid = dict(cat__output_kind=['Ind', 'Bin'],
                  learner__num_trees=[1, 2, 3])

# `iid` is deliberately not passed: 'warn' was merely its default value in
# older scikit-learn releases, and the parameter no longer exists in newer
# ones, where passing it makes the constructor raise TypeError.
grid = GridSearchCV(pipe, param_grid, cv=3)

grid.fit(X, y)
print(grid.best_params_)
# {'cat__output_kind': 'Ind', 'learner__num_trees': 1}
# the shape function for the given feature evaluated at the feature value. lr_feature_contributions = lr_model.get_feature_contributions(data) # Print predictions with feature contributions, which give a relative measure # of how much each feature impacted the Score. print("========== Feature Contributions for Linear Model ==========") print(lr_feature_contributions.head()) # label ... PredictedLabel Score ... FeatureContributions.hours-per-week # 0 0 ... 0 -2.010687 ... 0.833069 # 1 0 ... 0 -1.216163 ... 0.809928 # 2 1 ... 0 -1.248412 ... 0.485957 # 3 1 ... 0 -1.132419 ... 0.583148 # 4 0 ... 0 -1.969522 ... 0.437361 # define the training pipeline with a tree model tree_pipeline = Pipeline([FastTreesBinaryClassifier( feature=['age', 'education-num', 'hours-per-week'], label='label')]) # train the model tree_model = tree_pipeline.fit(data) # For tree-based models, the calculation of feature contribution essentially # consists in determining which splits in the tree have the most impact on the # final score and assigning the value of the impact to the features determining # the split. More precisely, the contribution of a feature is equal to the # change in score produced by exploring the opposite sub-tree every time a # decision node for the given feature is encountered. # # Consider a simple case with a single decision tree that has a decision node # for the binary feature F1. Given an example that has feature F1 equal to # true, we can calculate the score it would have obtained if we chose the # subtree corresponding to the feature F1 being equal to false while keeping