예제 #1
0
 def test_pass_decision_function_multiclass_3class(self):
     """Check decision_function on the 3-class data with string labels.

     The decision scores over the test set must sum to 38.0 (four
     decimals) and the fitted classes must be Blue, Green and Red.
     """
     model = FastLinearClassifier(number_of_threads=1)
     model.fit(X_train_3class, y_train_3class)

     decision_scores = model.decision_function(X_test_3class)
     assert_almost_equal(
         decision_scores.sum(),
         38.0,
         decimal=4,
         err_msg=invalid_decision_function_output)

     expected_classes = {'Blue', 'Green', 'Red'}
     assert_equal(set(model.classes_), expected_classes)
예제 #2
0
 def test_pass_predict_proba_multiclass_3class(self):
     """Check predict_proba on the 3-class data with string labels.

     The predicted probabilities over the test set must sum to 38.0
     (four decimals) and the fitted classes must be Blue, Green and Red.
     """
     model = FastLinearClassifier(number_of_threads=1)
     model.fit(X_train_3class, y_train_3class, verbose=0)

     probabilities = model.predict_proba(X_test_3class)
     assert_almost_equal(
         probabilities.sum(),
         38.0,
         decimal=4,
         err_msg=invalid_predict_proba_output)

     expected_classes = {'Blue', 'Green', 'Red'}
     assert_equal(set(model.classes_), expected_classes)
예제 #3
0
 def test_pass_predict_proba_multiclass_3class_retains_classes_type(self):
     """Check predict_proba on the 3-class data with integer labels.

     The predicted probabilities over the test set must sum to 38.0
     (four decimals) and classes_ must hold the integers 0, 1 and 2.
     """
     model = FastLinearClassifier(number_of_threads=1)
     model.fit(X_train_3class_int, y_train_3class_int)

     probabilities = model.predict_proba(X_test_3class_int)
     assert_almost_equal(
         probabilities.sum(),
         38.0,
         decimal=4,
         err_msg=invalid_predict_proba_output)

     expected_classes = {0, 1, 2}
     assert_equal(set(model.classes_), expected_classes)
예제 #4
0
 def test_text_label(self):
     """Predictions on the data from get_iris() must have dtype 'object'."""
     X, y = get_iris()

     feature_columns = [
         'Sepal_Width',
         'Sepal_Length',
         'Petal_Width',
         'Petal_Length',
     ]
     learner = FastLinearClassifier(feature=feature_columns)
     learner.fit(X, y)

     predictions = learner.predict(X)
     assert str(predictions.dtype) == "object"
예제 #5
0
    def test_unseen_classes(self):
        """CV must raise a Warning when some folds miss some classes."""
        # Overwrite the last labels with range(5) so that the cv splits
        # miss some of the classes.
        features = random_df()
        labels = random_series()
        labels[95:] = range(5)

        failure_message = (
            "CV didn't raise Warning exception b/c of minority class issue")
        with self.assertRaises(Warning, msg=failure_message):
            cross_validator = CV([FastLinearClassifier()])
            cross_validator.fit(features, labels, cv=3)
예제 #6
0
 def test_pass_predict_proba_multiclass_with_pipeline(self):
     """Each classifier, wrapped alone in a Pipeline, must give a
     proba_sum of 38.0 (three decimals)."""
     classifiers = (
         LogisticRegressionClassifier(),
         FastLinearClassifier(),
         LightGbmClassifier(),
     )
     for classifier in classifiers:
         single_step_pipeline = Pipeline([classifier])
         total_probability = proba_sum(single_step_pipeline)
         assert_almost_equal(
             total_probability,
             38.0,
             decimal=3,
             err_msg=invalid_predict_proba_output)
    def setUpClass(self):
        """Fit four pipelines and store each model with its PFI result.

        Stored attributes: the data streams (classification_data,
        regression_data, ranking_data), the fitted pipelines
        (binary_model, classifier_model, regressor_model, ranker_model)
        and the matching permutation-feature-importance results
        (binary_pfi, classifier_pfi, regressor_pfi, ranker_pfi).
        """
        # NOTE(review): unittest calls setUpClass on the class itself, so a
        # @classmethod decorator is presumably applied above this
        # definition; it is not visible here - confirm.

        # The binary and multiclass pipelines both use 'uciadult_train'.
        self.classification_data = FileDataStream.read_csv(
            get_dataset('uciadult_train').as_filepath())

        binary_steps = [
            OneHotVectorizer(columns=['education']),
            LogisticRegressionBinaryClassifier(
                feature=['age', 'education'],
                label='label',
                number_of_threads=1),
        ]
        self.binary_model = Pipeline(binary_steps).fit(
            self.classification_data)
        self.binary_pfi = self.binary_model.permutation_feature_importance(
            self.classification_data)

        classifier_steps = [
            OneHotVectorizer(columns=['education']),
            FastLinearClassifier(
                feature=['age', 'education'],
                label='label',
                number_of_threads=1,
                shuffle=False),
        ]
        self.classifier_model = Pipeline(classifier_steps).fit(
            self.classification_data)
        self.classifier_pfi = (
            self.classifier_model.permutation_feature_importance(
                self.classification_data))

        # The regressor is trained on the 'infert' data set.
        self.regression_data = FileDataStream.read_csv(
            get_dataset('infert').as_filepath())

        regressor_steps = [
            OneHotVectorizer(columns=['education']),
            FastLinearRegressor(
                feature=['induced', 'education'],
                label='age',
                number_of_threads=1,
                shuffle=False),
        ]
        self.regressor_model = Pipeline(regressor_steps).fit(
            self.regression_data)
        self.regressor_pfi = (
            self.regressor_model.permutation_feature_importance(
                self.regression_data))

        # The ranker is trained on the 'gen_tickettrain' data set.
        self.ranking_data = FileDataStream.read_csv(
            get_dataset('gen_tickettrain').as_filepath())

        ranker_steps = [
            ToKey(columns=['group']),
            LightGbmRanker(
                feature=['Class', 'dep_day', 'duration'],
                label='rank',
                group_id='group',
                random_state=0,
                number_of_threads=1),
        ]
        self.ranker_model = Pipeline(ranker_steps).fit(self.ranking_data)
        self.ranker_pfi = self.ranker_model.permutation_feature_importance(
            self.ranking_data)
예제 #8
0
    def test_decision_function_multiclass_3class_no_y_input_implies_no_classes_attribute(
            self):
        """Fitting without a separate y must not create classes_.

        The label column is joined into X and selected with
        label='Label'. The test fails if classes_ exists after fit or
        after decision_function, and also checks that the decision
        scores sum to 38.0 (four decimals).
        """
        train_with_label = X_train_3class_int.join(y_train_3class_int)
        test_with_label = X_test_3class_int.join(y_test_3class_int)

        model = FastLinearClassifier(number_of_threads=1, label='Label')
        model.fit(train_with_label)

        # classes_ is currently not supported when fit receives no y input.
        if hasattr(model, 'classes_'):
            self.fail("classes_ attribute not expected.")

        decision_scores = model.decision_function(test_with_label)
        assert_almost_equal(
            decision_scores.sum(),
            38.0,
            decimal=4,
            err_msg=invalid_decision_function_output)

        # Predicting must not add classes_ either when no y input was
        # given during fitting.
        if hasattr(model, 'classes_'):
            self.fail("classes_ attribute not expected.")
예제 #9
0
    def test_label_column_for_classifier_specified_as_argument(self):
        """A dry-run fit with label='d1' must use d1 as the label column
        and c1, c2, c3 as the features."""
        train_df = pd.DataFrame({
            'c1': [2, 3, 4, 5],
            'c2': [3, 4, 5, 6],
            'c3': [4, 5, 6, 7],
            'd1': [0, 1, 2, 1],
        })

        pipeline = Pipeline([FastLinearClassifier(label='d1')])
        dry_run_output = pipeline.fit(train_df, dry_run=True)
        graph = json.loads(dry_run_output)

        self.verify_classifier_nodes(
            graph,
            "d1",
            ['c1', 'c2', 'c3'],
            "Trainers.StochasticDualCoordinateAscentClassifier")
예제 #10
0
    def test_default_label_for_classifier_without_label_column(self):
        """A dry-run fit with no label argument must use the name "Label"
        and treat c1..c4 all as features."""
        train_df = pd.DataFrame({
            'c1': [2, 3, 4, 5],
            'c2': [3, 4, 5, 6],
            'c3': [4, 5, 6, 7],
            'c4': [0, 1, 2, 1],
        })

        pipeline = Pipeline([FastLinearClassifier()])
        dry_run_output = pipeline.fit(train_df, dry_run=True)
        graph = json.loads(dry_run_output)

        self.verify_classifier_nodes(
            graph,
            "Label",
            ['c1', 'c2', 'c3', 'c4'],
            "Trainers.StochasticDualCoordinateAscentClassifier")
예제 #11
0
    def test_pass_predict_proba_multiclass_with_pipeline_adds_classes(self):
        """Fitting through a Pipeline must set classes_ on both objects.

        classes_ is checked on the classifier and on the pipeline after
        fit and again after predict_proba; the probabilities must sum to
        38.0 (four decimals).
        """
        expected_classes = {'Blue', 'Green', 'Red'}

        classifier = FastLinearClassifier(number_of_threads=1)
        pipeline = Pipeline([classifier])
        pipeline.fit(X_train_3class, y_train_3class)

        # After fitting.
        assert_equal(set(classifier.classes_), expected_classes)
        assert_equal(set(pipeline.classes_), expected_classes)

        probabilities = pipeline.predict_proba(X_test_3class)
        assert_almost_equal(
            probabilities.sum(),
            38.0,
            decimal=4,
            err_msg=invalid_predict_proba_output)

        # After predicting the attributes must be unchanged.
        assert_equal(set(classifier.classes_), expected_classes)
        assert_equal(set(pipeline.classes_), expected_classes)
###############################################################################
# FastLinearClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import FastLinearClassifier
from sklearn.model_selection import train_test_split

# Build the train and test splits from the 'iris' data set, which looks like:
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
np.random.seed(0)

df = get_dataset("iris").as_df()
df.drop(columns=['Species'], inplace=True)

# every column except 'Label' is a feature
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'Label'],
    df['Label'])
lr = FastLinearClassifier().fit(X_train, y_train)

scores = lr.predict(X_test)

# evaluate the model: compare against a plain list of the predictions
accuracy = np.mean(y_test == list(scores))
print('Accuracy:', accuracy)
예제 #13
0
# Schema string for the data file: separator, one 'col=' entry per column
# (or column range), and the header flag, joined by single spaces.
file_schema = ' '.join((
    'sep=,',
    'col=label:R4:0',
    'col=Features:R4:9-14',
    'col=workclass:TX:1',
    'col=education:TX:2',
    'col=marital-status:TX:3',
    'col=occupation:TX:4',
    'col=relationship:TX:5',
    'col=ethnicity:TX:6',
    'col=sex:TX:7',
    'col=native-country-region:TX:8',
    'header+',
))
label_column = 'label'
# Learner instances, each constructed with its default arguments.
# NOTE(review): presumably these are the learners expected to support model
# summaries, in contrast to `learners_not_supported` below - confirm.
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier() is deliberately left out of the list.
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]

learners_not_supported = [
    NaiveBayesClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    KMeansPlusPlus(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
예제 #14
0
# load the 'infert' data set as a FileDataStream
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#   age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# training pipeline: one-hot encode 'education' into 'edu', then classify
# 'induced' from 'age', 'edu' and 'parity'
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastLinearClassifier(
        label='induced',
        feature=['age', 'edu', 'parity']),
])

# fit on the data, then evaluate on the same data
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(
    data,
    output_scores=True)

# show the first predictions
print(predictions.head())
#   PredictedLabel   Score.0   Score.1   Score.2
# 0               2  0.015312  0.058199  0.926489
# 1               0  0.892915  0.097093  0.009991
# 2               2  0.058976  0.123581  0.817444
# 3               2  0.287882  0.245397  0.466721
# 4               0  0.404075  0.362293  0.233632
예제 #15
0
 def test_FastLinearClassifier(self):
     """Check the accuracy get_accuracy reports for FastLinearClassifier.

     The value must match 0.97368421052 to eight decimal places.
     """
     # Single source of truth for the expected value and the message.
     expected_accuracy = 0.97368421052
     acc = get_accuracy(self, FastLinearClassifier())
     # The compared quantity is an accuracy, so the failure message says
     # "Accuracy" rather than "Sum".
     assert_almost_equal(acc,
                         expected_accuracy,
                         decimal=8,
                         err_msg="Accuracy should be %s" % expected_accuracy)
#               FeatureName  AreaUnderRocCurve  AreaUnderRocCurve.StdErr  ...
# 0                     age          -0.081604                       0.0  ...
# 6   education.Prof-school          -0.012964                       0.0  ...
# 10    education.Doctorate          -0.012863                       0.0  ...
# 8     education.Bachelors          -0.010593                       0.0  ...
# 2       education.HS-grad          -0.005918                       0.0  ...

###############################
# PFI for Classification models
###############################
# Training pipeline with a multiclass classifier.
# A single thread and no shuffling are used to force determinism.
multiclass_pipeline = Pipeline([
    OneHotVectorizer(columns=['education']),
    FastLinearClassifier(
        feature=['age', 'education'],
        label='label',
        number_of_threads=1,
        shuffle=False),
])

# fit the pipeline
multiclass_model = multiclass_pipeline.fit(classification_data)

# compute permutation feature importance on the same data
multiclass_pfi = multiclass_model.permutation_feature_importance(
    classification_data)

# Show the PFI rows sorted by Macro accuracy. Macro accuracy is an
# increasing metric, so the largest negative changes mark the most
# important features.
print("================== PFI for Classification Model ==================")
print(multiclass_pfi.sort_values(by='MacroAccuracy').head())