def test_pass_decision_function_multiclass_3class(self):
    """Check decision_function output on the 3-class data set.

    Fits a single-threaded FastLinearClassifier, then verifies that the
    decision_function values over the test set sum to 38.0 (to 4
    decimals) and that ``classes_`` holds exactly the three label names.
    """
    model = FastLinearClassifier(number_of_threads=1)
    model.fit(X_train_3class, y_train_3class)

    total = model.decision_function(X_test_3class).sum()
    assert_almost_equal(
        total,
        38.0,
        decimal=4,
        err_msg=invalid_decision_function_output)

    learned_labels = set(model.classes_)
    assert_equal(learned_labels, {'Blue', 'Green', 'Red'})
def test_pass_predict_proba_multiclass_3class(self):
    """Check predict_proba output on the 3-class data set.

    Fits a single-threaded FastLinearClassifier with ``verbose=0``, then
    verifies that the predict_proba values over the test set sum to 38.0
    (to 4 decimals) and that ``classes_`` holds exactly the three label
    names.
    """
    model = FastLinearClassifier(number_of_threads=1)
    model.fit(X_train_3class, y_train_3class, verbose=0)

    total = model.predict_proba(X_test_3class).sum()
    assert_almost_equal(
        total,
        38.0,
        decimal=4,
        err_msg=invalid_predict_proba_output)

    learned_labels = set(model.classes_)
    assert_equal(learned_labels, {'Blue', 'Green', 'Red'})
def test_pass_predict_proba_multiclass_3class_retains_classes_type(self):
    """Check predict_proba and ``classes_`` on the integer-labelled data.

    Same flow as the string-labelled predict_proba test, but trained on
    the ``*_3class_int`` fixtures; ``classes_`` is expected to contain
    the values 0, 1 and 2.
    """
    model = FastLinearClassifier(number_of_threads=1)
    model.fit(X_train_3class_int, y_train_3class_int)

    total = model.predict_proba(X_test_3class_int).sum()
    assert_almost_equal(
        total,
        38.0,
        decimal=4,
        err_msg=invalid_predict_proba_output)

    learned_labels = set(model.classes_)
    assert_equal(learned_labels, {0, 1, 2})
def test_text_label(self):
    """Predictions on the get_iris() data must have dtype ``object``."""
    X, y = get_iris()
    feature_columns = [
        'Sepal_Width',
        'Sepal_Length',
        'Petal_Width',
        'Petal_Length',
    ]
    learner = FastLinearClassifier(feature=feature_columns)
    learner.fit(X, y)

    predicted = learner.predict(X)
    assert str(predicted.dtype) == "object"
def test_unseen_classes(self):
    """CV.fit must raise a Warning when folds can miss some classes."""
    # Create a dataset such that cv splits miss some of the classes:
    # every row from position 95 on is overwritten with the values 0..4.
    features = random_df()
    labels = random_series()
    labels[95:] = range(5)

    msg = 'CV didn\'t raise Warning exception b/c of minority class issue'
    with self.assertRaises(Warning, msg=msg):
        CV([FastLinearClassifier()]).fit(features, labels, cv=3)
def test_pass_predict_proba_multiclass_with_pipeline(self):
    """Check proba_sum for several classifiers wrapped in a Pipeline.

    Each learner is placed alone in a Pipeline and the value returned
    by ``proba_sum`` must equal 38.0 to 3 decimals.
    """
    learners = (
        LogisticRegressionClassifier(),
        FastLinearClassifier(),
        LightGbmClassifier(),
    )
    for learner in learners:
        total = proba_sum(Pipeline([learner]))
        assert_almost_equal(
            total,
            38.0,
            decimal=3,
            err_msg=invalid_predict_proba_output)
def setUpClass(self):
    """Train one model per task and cache its permutation feature importance.

    Stores, on the object passed in, the data stream, fitted pipeline and
    PFI result for four learners: a binary classifier and a classifier
    on 'uciadult_train', a regressor on 'infert', and a ranker on
    'gen_tickettrain'.

    NOTE(review): the first argument is named ``self``; presumably this is
    decorated with ``@classmethod`` outside the visible code, in which
    case it receives the test class -- confirm.
    """
    # ---- classification data (shared by the two classifiers) ----
    census_stream = FileDataStream.read_csv(
        get_dataset('uciadult_train').as_filepath())
    self.classification_data = census_stream

    # binary classifier
    binary_steps = [
        OneHotVectorizer(columns=['education']),
        LogisticRegressionBinaryClassifier(
            feature=['age', 'education'],
            label='label',
            number_of_threads=1),
    ]
    self.binary_model = Pipeline(binary_steps).fit(
        self.classification_data)
    self.binary_pfi = self.binary_model.permutation_feature_importance(
        self.classification_data)

    # classifier
    classifier_steps = [
        OneHotVectorizer(columns=['education']),
        FastLinearClassifier(
            feature=['age', 'education'],
            label='label',
            number_of_threads=1,
            shuffle=False),
    ]
    self.classifier_model = Pipeline(classifier_steps).fit(
        self.classification_data)
    self.classifier_pfi = \
        self.classifier_model.permutation_feature_importance(
            self.classification_data)

    # ---- regression data ----
    infert_stream = FileDataStream.read_csv(
        get_dataset('infert').as_filepath())
    self.regression_data = infert_stream

    regressor_steps = [
        OneHotVectorizer(columns=['education']),
        FastLinearRegressor(
            feature=['induced', 'education'],
            label='age',
            number_of_threads=1,
            shuffle=False),
    ]
    self.regressor_model = Pipeline(regressor_steps).fit(
        self.regression_data)
    self.regressor_pfi = \
        self.regressor_model.permutation_feature_importance(
            self.regression_data)

    # ---- ranking data ----
    ticket_stream = FileDataStream.read_csv(
        get_dataset('gen_tickettrain').as_filepath())
    self.ranking_data = ticket_stream

    ranker_steps = [
        ToKey(columns=['group']),
        LightGbmRanker(
            feature=['Class', 'dep_day', 'duration'],
            label='rank',
            group_id='group',
            random_state=0,
            number_of_threads=1),
    ]
    self.ranker_model = Pipeline(ranker_steps).fit(self.ranking_data)
    self.ranker_pfi = self.ranker_model.permutation_feature_importance(
        self.ranking_data)
def test_decision_function_multiclass_3class_no_y_input_implies_no_classes_attribute(
        self):
    """Fitting without a separate y must not create ``classes_``.

    The label column is joined into the feature frame and selected via
    ``label='Label'``; ``fit`` is then called with X only. The test
    fails if ``classes_`` exists after fitting or after calling
    ``decision_function``, and also checks that the decision_function
    values sum to 38.0 (to 4 decimals).
    """
    train_frame = X_train_3class_int.join(y_train_3class_int)
    test_frame = X_test_3class_int.join(y_test_3class_int)

    model = FastLinearClassifier(number_of_threads=1, label='Label')

    def fail_if_classes_present():
        # The classes_ attribute is currently not supported when no
        # y input was specified during fitting.
        if hasattr(model, 'classes_'):
            self.fail("classes_ attribute not expected.")

    model.fit(train_frame)
    fail_if_classes_present()

    total = model.decision_function(test_frame).sum()
    assert_almost_equal(
        total,
        38.0,
        decimal=4,
        err_msg=invalid_decision_function_output)

    # Predicting must not add the attribute either.
    fail_if_classes_present()
def test_label_column_for_classifier_specified_as_argument(self):
    """Dry-run graph must use the label column passed as ``label='d1'``.

    The remaining columns (c1, c2, c3) are expected as features of the
    ``Trainers.StochasticDualCoordinateAscentClassifier`` node.
    """
    frame = pd.DataFrame({
        'c1': [2, 3, 4, 5],
        'c2': [3, 4, 5, 6],
        'c3': [4, 5, 6, 7],
        'd1': [0, 1, 2, 1]
    })

    learner = FastLinearClassifier(label='d1')
    graph_json = Pipeline([learner]).fit(frame, dry_run=True)
    graph = json.loads(graph_json)

    self.verify_classifier_nodes(
        graph,
        "d1",
        ['c1', 'c2', 'c3'],
        "Trainers.StochasticDualCoordinateAscentClassifier")
def test_default_label_for_classifier_without_label_column(self):
    """Dry-run graph must fall back to "Label" when no label is given.

    With no ``label`` argument, all four columns (c1..c4) are expected
    as features of the
    ``Trainers.StochasticDualCoordinateAscentClassifier`` node.
    """
    frame = pd.DataFrame({
        'c1': [2, 3, 4, 5],
        'c2': [3, 4, 5, 6],
        'c3': [4, 5, 6, 7],
        'c4': [0, 1, 2, 1]
    })

    learner = FastLinearClassifier()
    graph_json = Pipeline([learner]).fit(frame, dry_run=True)
    graph = json.loads(graph_json)

    self.verify_classifier_nodes(
        graph,
        "Label",
        ['c1', 'c2', 'c3', 'c4'],
        "Trainers.StochasticDualCoordinateAscentClassifier")
def test_pass_predict_proba_multiclass_with_pipeline_adds_classes(self):
    """``classes_`` must be set on both the learner and its Pipeline.

    The attribute is checked on both objects right after fitting and
    again after ``predict_proba``, whose values over the test set must
    sum to 38.0 (to 4 decimals).
    """
    learner = FastLinearClassifier(number_of_threads=1)
    wrapped = Pipeline([learner])
    wrapped.fit(X_train_3class, y_train_3class)

    expected_classes = {'Blue', 'Green', 'Red'}

    # After fitting.
    assert_equal(set(learner.classes_), expected_classes)
    assert_equal(set(wrapped.classes_), expected_classes)

    total = wrapped.predict_proba(X_test_3class).sum()
    assert_almost_equal(
        total,
        38.0,
        decimal=4,
        err_msg=invalid_predict_proba_output)

    # After predicting.
    assert_equal(set(learner.classes_), expected_classes)
    assert_equal(set(wrapped.classes_), expected_classes)
###############################################################################
# FastLinearClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import FastLinearClassifier
from sklearn.model_selection import train_test_split

# Use the 'iris' data set to create test and train data.
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Label Species  Setosa
# 0           5.1          3.5           1.4          0.2      0  setosa     1.0
# 1           4.9          3.0           1.4          0.2      0  setosa     1.0
np.random.seed(0)

df = get_dataset("iris").as_df()
df.drop(columns=['Species'], inplace=True)

# Every column except 'Label' is a feature; 'Label' is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'Label'],
    df['Label'])

lr = FastLinearClassifier().fit(X_train, y_train)
scores = lr.predict(X_test)

# Evaluate the model. The predictions are turned into a plain list before
# being compared with y_test.
accuracy = np.mean(y_test == list(scores))
print('Accuracy:', accuracy)
file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 col=workclass:TX:1 ' \ 'col=education:TX:2 col=marital-status:TX:3 ' \ 'col=occupation:TX:4 col=relationship:TX:5 col=ethnicity:TX:6 ' \ 'col=sex:TX:7 col=native-country-region:TX:8 header+' label_column = 'label' learners = [ FastForestBinaryClassifier(), FastForestRegressor(), FastTreesBinaryClassifier(), FastTreesRegressor(), FastTreesTweedieRegressor(), LightGbmRegressor(), LightGbmBinaryClassifier(), AveragedPerceptronBinaryClassifier(), FastLinearBinaryClassifier(), FastLinearClassifier(), FastLinearRegressor(), LogisticRegressionBinaryClassifier(), LogisticRegressionClassifier(), OnlineGradientDescentRegressor(), SgdBinaryClassifier(), # SymSgdBinaryClassifier(), OrdinaryLeastSquaresRegressor(), PoissonRegressionRegressor() ] learners_not_supported = [ NaiveBayesClassifier(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView KMeansPlusPlus(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
# Data input (as a FileDataStream).
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# Define the training pipeline: one-hot encode 'education' into a new
# 'edu' column, then train the classifier on 'age', 'edu' and 'parity'
# with 'induced' as the label.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastLinearClassifier(
        feature=['age', 'edu', 'parity'],
        label='induced'),
])

# Train, predict, and evaluate on the same data.
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# Print predictions.
print(predictions.head())
#    PredictedLabel   Score.0   Score.1   Score.2
# 0               2  0.015312  0.058199  0.926489
# 1               0  0.892915  0.097093  0.009991
# 2               2  0.058976  0.123581  0.817444
# 3               2  0.287882  0.245397  0.466721
# 4               0  0.404075  0.362293  0.233632
def test_FastLinearClassifier(self):
    """Check the accuracy obtained with a default FastLinearClassifier.

    ``get_accuracy`` is called with this test case and the learner; its
    result must match the expected accuracy to 8 decimals.
    """
    # Single source of truth for the expected value, used both in the
    # comparison and in the failure message.
    expected_accuracy = 0.97368421052
    acc = get_accuracy(self, FastLinearClassifier())
    assert_almost_equal(
        acc,
        expected_accuracy,
        decimal=8,
        # The checked quantity is an accuracy, so the message says so
        # (it previously read "Sum should be ...").
        err_msg="Accuracy should be %s" % expected_accuracy)
#                FeatureName  AreaUnderRocCurve  AreaUnderRocCurve.StdErr  ...
# 0                      age          -0.081604                       0.0  ...
# 6    education.Prof-school          -0.012964                       0.0  ...
# 10     education.Doctorate          -0.012863                       0.0  ...
# 8      education.Bachelors          -0.010593                       0.0  ...
# 2        education.HS-grad          -0.005918                       0.0  ...


###############################
# PFI for Classification models
###############################
# Define the training pipeline with a classifier.
# Use 1 thread and no shuffling to force determinism.
multiclass_pipeline = Pipeline([
    OneHotVectorizer(columns=['education']),
    FastLinearClassifier(
        feature=['age', 'education'],
        label='label',
        number_of_threads=1,
        shuffle=False),
])

# Train the model.
multiclass_model = multiclass_pipeline.fit(classification_data)

# Get permutation feature importance.
multiclass_pfi = multiclass_model.permutation_feature_importance(
    classification_data)

# Print PFI for each feature, ordered by most important features w.r.t. Macro
# accuracy. Since Macro accuracy is an increasing metric, the highest negative
# changes indicate the most important features.
print("================== PFI for Classification Model ==================")
print(
    multiclass_pfi
    .sort_values('MacroAccuracy')
    .head()
)