Example #1
    def test_pickle_pipeline(self):
        np.random.seed(0)
        (X_train, y_train) = get_X_y(train_file,
                                     label_column, sep=',',
                                     features=selected_features)
        (X_test, y_test) = get_X_y(test_file,
                                   label_column, sep=',',
                                   features=selected_features)
        if 'F1' in X_train.columns:
            raise Exception("F1 is in the dataset")
        cat = OneHotVectorizer() << 'age'
        ftree = FastTreesBinaryClassifier()
        pipe = Pipeline(steps=[("cat", cat), ("ftree", ftree)])
        pipe.fit(X_train, y_train)

        scores = pipe.predict(X_test)
        accu1 = np.mean(y_test.values.ravel() == scores.values)

        # Unpickle model and score. We should get the exact same accuracy as
        # above
        s = pickle.dumps(pipe)
        # delete the model files written to disk during fit; unpickling
        # must succeed from the pickled bytes alone
        os.remove(cat.model_)
        os.remove(ftree.model_)
        pipe2 = pickle.loads(s)

        scores2 = pipe2.predict(X_test)
        accu2 = np.mean(y_test.values.ravel() == scores2.values)
        assert_equal(
            accu1,
            accu2,
            "accuracy mismatch after unpickling pipeline")
Example #2
    def test_pipeline_clone(self):
        (X_train, y_train) = get_X_y(train_file,
                                     label_column, sep=',',
                                     features=selected_features)
        (X_test, y_test) = get_X_y(test_file,
                                   label_column, sep=',',
                                   features=selected_features)
        if 'F1' in X_train.columns:
            raise Exception("F1 is in the dataset")
        cat = OneHotVectorizer() << 'age'
        ftree = FastTreesBinaryClassifier()
        nimbusmlpipe = nimbusmlPipeline([cat, ftree])
        skpipe = Pipeline(steps=[('nimbusml', nimbusmlpipe)])
        skpipe.fit(X_train, y_train)

        scores = skpipe.predict(X_test)

        copy = clone(skpipe)
        scores2 = copy.predict(X_test)
        assert_frame_equal(scores, scores2)

        # checks we can fit again
        skpipe.fit(X_train, y_train)
        scores3 = skpipe.predict(X_test)
        assert_frame_equal(scores, scores3)
Example #3
    def test_pickle_pipeline_and_nimbusml_pipeline(self):
        (X_train, y_train) = get_X_y(train_file,
                                     label_column, sep=',',
                                     features=selected_features)
        (X_test, y_test) = get_X_y(test_file,
                                   label_column, sep=',',
                                   features=selected_features)
        if 'F1' in X_train.columns:
            raise Exception("F1 is in the dataset")
        cat = OneHotVectorizer() << 'age'
        ftree = FastTreesBinaryClassifier()
        nimbusmlpipe = nimbusmlPipeline([cat, ftree])
        skpipe = Pipeline(steps=[('nimbusml', nimbusmlpipe)])
        skpipe.fit(X_train, y_train)

        scores = skpipe.predict(X_test)
        accu1 = np.mean(y_test.values.ravel() == scores["PredictedLabel"].values)

        # Unpickle model and score. We should get the exact same accuracy as
        # above
        s = pickle.dumps(skpipe)
        pipe2 = pickle.loads(s)
        scores2 = pipe2.predict(X_test)
        accu2 = np.mean(y_test.values.ravel() == scores2["PredictedLabel"].values)
        assert_equal(
            accu1,
            accu2,
            "accuracy mismatch after unpickling pipeline")
        assert_frame_equal(scores, scores2)
Example #4
    def test_experiment_loadsavemodel(self):
        (train, label) = get_X_y(train_file, label_column, sep=',')
        (test, label1) = get_X_y(test_file, label_column, sep=',')
        cat = OneHotVectorizer() << categorical_columns
        ftree = FastTreesBinaryClassifier()
        pipeline = Pipeline([cat, ftree])
        pipeline.fit(train, label)
        metrics1, scores1 = pipeline.test(test,
                                          label1,
                                          'binary',
                                          output_scores=True)
        sum1 = metrics1.sum().sum()
        (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
        # close the OS-level handle so save_model can write to the path
        fl = os.fdopen(fd, 'w')
        fl.close()
        pipeline.save_model(modelfilename)

        pipeline2 = Pipeline()
        pipeline2.load_model(modelfilename)
        metrics2, scores2 = pipeline2.test(test,
                                           label1,
                                           'binary',
                                           output_scores=True)
        sum2 = metrics2.sum().sum()

        assert_equal(sum1, sum2,
                     "model metrics don't match after loading model")
Example #5
    def test_pipeline_grid_search(self):
        (X_train, y_train) = get_X_y(train_file,
                                     label_column, sep=',',
                                     features=selected_features)
        (X_test, y_test) = get_X_y(test_file,
                                   label_column, sep=',',
                                   features=selected_features)
        if 'F1' in X_train.columns:
            raise Exception("F1 is in the dataset")
        cat = OneHotVectorizer() << 'age'
        ftree = FastTreesBinaryClassifier(number_of_trees=5)
        pipe = Pipeline(
            steps=[
                ("cat", cat), ('pca', PCA(5)), ("ftree", ftree)])

        grid = GridSearchCV(pipe, dict(pca__n_components=[2],
                                       ftree__number_of_trees=[11]))
        grid.fit(X_train, y_train)
        assert grid.best_params_ == {
            'ftree__number_of_trees': 11,
            'pca__n_components': 2}
        steps = grid.best_estimator_.steps
        ft = steps[-1][1]
        number_of_trees = ft.number_of_trees
        assert number_of_trees == 11
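The `step__parameter` keys in the grid use scikit-learn's nested-parameter convention, which these pipelines support via `get_params`/`set_params`. The same addressing works outside `GridSearchCV`; a minimal sketch, reusing `pipe` from above:

    pipe.set_params(ftree__number_of_trees=11, pca__n_components=2)

Example #17 below uses exactly this pattern to retrain at specific settings.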
Example #6
    def test_clone_sweep(self):
        # grid search, then clone pipeline and grid search again
        # results should be same
        np.random.seed(0)
        (X_train, y_train) = get_X_y(train_file,
                                     label_column,
                                     sep=',',
                                     encoding='utf-8')
        (X_test, y_test) = get_X_y(test_file,
                                   label_column,
                                   sep=',',
                                   encoding='utf-8')

        cat = OneHotHashVectorizer() << categorical_columns
        learner = FastTreesBinaryClassifier(number_of_trees=100,
                                            number_of_leaves=5)
        pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

        param_grid = dict(learner__number_of_trees=[1, 5, 10])
        grid = GridSearchCV(pipe, param_grid)
        grid.fit(X_train, y_train)

        pipe1 = pipe.clone()
        grid1 = GridSearchCV(pipe1, param_grid)
        grid1.fit(X_train, y_train)

        assert (grid.best_params_['learner__number_of_trees'] ==
                grid1.best_params_['learner__number_of_trees'])
Example #7
    def test_trees(self):
        (train, label) = get_X_y(train_file, label_column, sep=',')
        (test, label1) = get_X_y(test_file, label_column, sep=',')
        pipeline = Pipeline([OneHotVectorizer() << categorical_columns,
                             FastTreesBinaryClassifier()])
        pipeline.fit(train, label)
        out_data = pipeline.predict(test)
        check_accuracy(test_file, label_column, out_data, 0.65)
Example #8
    def test_linear_with_train_test_schema(self):
        (train, label) = get_X_y(train_file, label_column, sep=',')
        (test, label1) = get_X_y(test_file, label_column, sep=',')
        pipeline = Pipeline([OneHotVectorizer() << categorical_columns,
                             FastLinearBinaryClassifier(train_threads=1,
                                                        shuffle=False)])
        pipeline.fit(train, label)
        out_data = pipeline.predict(test)
        check_accuracy(test_file, label_column, out_data, 0.65)
Example #9
    def test_trees(self):
        np.random.seed(0)
        (train, label) = get_X_y(train_file, label_column, sep=',')
        (test, label1) = get_X_y(test_file, label_column, sep=',')
        pipe = Pipeline(steps=[('cat',
                                OneHotVectorizer() << categorical_columns),
                               ('tree', FastTreesBinaryClassifier())])
        pipe.fit(train, label)
        out_data = pipe.predict(test)
        check_accuracy_scikit(test_file, label_column, out_data, 0.77)
Example #10
    def test_linear(self):
        np.random.seed(0)
        (train, label) = get_X_y(train_file, label_column, sep=',')
        (test, label1) = get_X_y(test_file, label_column, sep=',')
        pipe = Pipeline(steps=[('cat',
                                OneHotVectorizer() << categorical_columns),
                               ('linear',
                                FastLinearBinaryClassifier(
                                    shuffle=False, number_of_threads=1))])
        pipe.fit(train, label)
        out_data = pipe.predict(test)
        check_accuracy_scikit(test_file, label_column, out_data, 0.779)
Example #11
def check_accuracy(test_file, label_column, predictions, threshold, sep=','):
    (test, label) = get_X_y(test_file, label_column, sep=sep)
    accuracy = np.mean(
        label[label_column].values ==
        predictions.loc[:, 'PredictedLabel'].values)
    assert_greater(accuracy, threshold,
                   "accuracy should be greater than %s" % threshold)
Example #12
    def test_error_conditions(self):
        # grid search on a wrong param
        np.random.seed(0)
        (X_train, y_train) = get_X_y(train_file,
                                     label_column, sep=',', encoding='utf-8')
        (X_test, y_test) = get_X_y(test_file,
                                   label_column, sep=',', encoding='utf-8')

        cat = OneHotHashVectorizer() << categorical_columns
        learner = FastTreesBinaryClassifier(number_of_trees=100,
                                            number_of_leaves=5)
        pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

        param_grid = dict(learner__wrong_arg=[1, 5, 10])
        grid = GridSearchCV(pipe, param_grid)

        assert_raises(ValueError, grid.fit, X_train, y_train)
Example #13
    def test_lightgbmclassifier(self):
        np.random.seed(0)
        train_file = get_dataset('wiki_detox_train').as_filepath()
        (train,
         label) = get_X_y(train_file,
                          label_column='Sentiment',
                          sep='\t',
                          encoding="utf-8")
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # map text reviews to vector space
        texttransform = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            vector_normalizer='None') << 'SentimentText'
        X_train = texttransform.fit_transform(X_train, max_slots=5000)
        X_test = texttransform.transform(X_test, max_slots=5000)

        mymodel = LightGbmClassifier().fit(X_train, y_train, verbose=0)
        scores = mymodel.predict(X_test)
        accuracy = np.mean(y_test.values.ravel() == scores.values)
        assert_greater(accuracy, 0.58,
                       "accuracy should be greater than 0.58")
Example #14
    def test_naivebayesclassifier(self):
        np.random.seed(0)
        train_file = get_dataset("wiki_detox_train").as_filepath()
        (train, label) = get_X_y(train_file, label_column='Sentiment',
                                 sep='\t')
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # map text reviews to vector space
        texttransform = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            vector_normalizer='None') << 'SentimentText'
        X_train = texttransform.fit_transform(X_train)
        X_test = texttransform.transform(X_test)

        mymodel = NaiveBayesClassifier()
        mymodel.fit(X_train, y_train)

        scores = mymodel.predict(X_test)
        accuracy = np.mean(y_test.values.ravel() == scores.values)
        assert_greater(accuracy, 0.5,
                       "accuracy should be greater than 0.5")
Example #15
    def test_feature_union(self):
        np.random.seed(0)
        (train, label) = get_X_y(train_file, label_column,
                                 sep=',', features=selected_features)
        (test, label1) = get_X_y(test_file, label_column,
                                 sep=',', features=selected_features)
        fu = FeatureUnion(transformer_list=[
            ('onehot', OneHotEncoder()),
            ('cat', OneHotVectorizer())
        ])
        pipe = Pipeline(
            steps=[
                ('fu', fu), ('linear', FastLinearBinaryClassifier(
                    shuffle=False, number_of_threads=1))])
        pipe.fit(train, label)
        out_data = pipe.predict(test)
        check_accuracy_scikit(test_file, label_column, out_data, 0.709)
Example #16
    def test_parallel(self):
        (train, label) = get_X_y(train_file, label_column, sep=',')
        cat = OneHotVectorizer() << categorical_columns
        ftree = FastTreesBinaryClassifier()
        pipeline = Pipeline([cat, ftree])

        result = pipeline.fit(train, label, parallel=8)
        result2 = pipeline.fit(train, label, parallel=1)
        assert_true(result == result2)
Example #17
    def test_uciadult_sweep(self):
        # grid search over number_of_trees, then confirm the best value
        # with a full train
        np.random.seed(0)
        (X_train, y_train) = get_X_y(train_file,
                                     label_column,
                                     sep=',',
                                     encoding='utf-8')
        (X_test, y_test) = get_X_y(test_file,
                                   label_column,
                                   sep=',',
                                   encoding='utf-8')

        cat = OneHotHashVectorizer() << categorical_columns
        # number_of_trees=100 will never actually be run by the grid search,
        # as it's not in param_grid below
        learner = FastTreesBinaryClassifier(number_of_trees=100,
                                            number_of_leaves=5)
        pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

        param_grid = dict(learner__number_of_trees=[1, 5, 10])
        grid = GridSearchCV(pipe, param_grid)

        grid.fit(X_train, y_train)
        assert grid.best_params_['learner__number_of_trees'] == 10

        # compare AUC on number_of_trees 1, 5, 10
        pipe.set_params(learner__number_of_trees=1)
        pipe.fit(X_train, y_train)
        metrics1, _ = pipe.test(X_train, y_train)

        pipe.set_params(learner__number_of_trees=5)
        pipe.fit(X_train, y_train)
        metrics5, _ = pipe.test(X_train, y_train)

        pipe.set_params(learner__number_of_trees=10)
        pipe.fit(X_train, y_train)
        metrics10, _ = pipe.test(X_train, y_train)

        assert metrics10['AUC'][0] > metrics5['AUC'][0]
        assert metrics10['AUC'][0] > metrics1['AUC'][0]
        assert metrics10['AUC'][0] > 0.59
Example #18
    def test_pipeline_get_params(self):
        (X_train, y_train) = get_X_y(train_file,
                                     label_column, sep=',',
                                     features=selected_features)
        (X_test, y_test) = get_X_y(test_file,
                                   label_column, sep=',',
                                   features=selected_features)
        if 'F1' in X_train.columns:
            raise Exception("F1 is in the dataset")
        cat = OneHotVectorizer() << 'age'
        ftree = FastTreesBinaryClassifier()
        nimbusmlpipe = nimbusmlPipeline([cat, ftree])
        skpipe = Pipeline(steps=[('nimbusml', nimbusmlpipe)])
        skpipe.fit(X_train, y_train)
        pars = skpipe.get_params(deep=True)
        assert 'steps' in pars
        step = pars['steps'][0]
        assert len(step) == 2
        assert 'nimbusml' in pars
        assert 'nimbusml__random_state' in pars
        assert 'nimbusml__steps' in pars
Example #19
    def test_pickle_predictor(self):
        np.random.seed(0)
        (X_train, y_train) = get_X_y(train_file,
                                     label_column,
                                     sep=',',
                                     features=selected_features)
        (X_test, y_test) = get_X_y(test_file,
                                   label_column,
                                   sep=',',
                                   features=selected_features)

        ftree = FastTreesBinaryClassifier().fit(X_train, y_train)
        scores = ftree.predict(X_test)
        accu1 = np.mean(y_test.values.ravel() == scores.values)

        # Unpickle model and score. We should get the exact same accuracy as
        # above
        s = pickle.dumps(ftree)
        ftree2 = pickle.loads(s)
        scores2 = ftree2.predict(X_test)
        accu2 = np.mean(y_test.values.ravel() == scores2.values)
        assert_equal(accu1, accu2,
                     "accuracy mismatch after unpickling predictor")
    def test_ngramfeaturizer(self):
        np.random.seed(0)
        train_file = get_dataset('wiki_detox_train').as_filepath()
        (train, label) = get_X_y(train_file,
                                 label_column='Sentiment',
                                 sep='\t',
                                 encoding="utf-8")
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # map text reviews to vector space
        texttransform = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            vector_normalizer='None') << 'SentimentText'
        X_train = texttransform.fit_transform(X_train[:100])
        total = X_train.iloc[:].sum().sum()
        assert_equal(total, 30513, "sum of all features is incorrect!")
Example #21
    def test_pickle_transform(self):
        np.random.seed(0)
        (X_train, y_train) = get_X_y(train_file,
                                     label_column,
                                     sep=',',
                                     features=selected_features)
        cat = (OneHotVectorizer() << ['age']).fit(X_train, verbose=0)
        out1 = cat.transform(X_train)

        # Unpickle transform and generate output.
        # We should get the exact same output as above
        s = pickle.dumps(cat)
        cat2 = pickle.loads(s)
        out2 = cat2.transform(X_train)
        assert_equal(out1.sum().sum(),
                     out2.sum().sum(),
                     "data mismatch after unpickling transform")
Example #22
    def test_performance_syntax(self):
        train_file = get_dataset('uciadult_train').as_filepath()
        test_file = get_dataset('uciadult_test').as_filepath()
        file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 ' \
                      'col=workclass:TX:1 col=education:TX:2 ' \
                      'col=marital-status:TX:3 col=occupation:TX:4 ' \
                      'col=relationship:TX:5 col=ethnicity:TX:6 ' \
                      'col=sex:TX:7 col=native-country-region:TX:8 header+'
        categorical_columns = [
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'ethnicity', 'sex', 'native-country-region'
        ]
        label_column = 'label'
        na_columns = ['Features']
        feature_columns_idv = na_columns + categorical_columns

        exp = Pipeline([
            OneHotHashVectorizer(columns=categorical_columns),
            Handler(columns=na_columns),
            FastLinearBinaryClassifier(feature=feature_columns_idv,
                                       label=label_column)
        ])

        train_data = FileDataStream(train_file, schema=file_schema)
        exp.fit(train_data, label_column, verbose=0)
        print("train time %s" % exp._run_time)

        test_data = FileDataStream(test_file, schema=file_schema)
        out_data = exp.predict(test_data)
        print("predict time %s" % exp._run_time)

        (test, label_test) = get_X_y(test_file, label_column, sep=',')
        (acc1, auc1) = evaluate_binary_classifier(
            label_test.iloc[:, 0].values,
            out_data.loc[:, 'PredictedLabel'].values,
            out_data.loc[:, 'Probability'].values)

        print('ACC %s, AUC %s' % (acc1, auc1))

        exp = Pipeline([
            OneHotHashVectorizer() << categorical_columns,
            Handler() << na_columns,
            FastLinearBinaryClassifier() << feature_columns_idv
        ])

        train_data = FileDataStream(train_file, schema=file_schema)
        exp.fit(train_data, label_column, verbose=0)
        print("train time %s" % exp._run_time)

        test_data = FileDataStream(test_file, schema=file_schema)
        out_data = exp.predict(test_data)
        print("predict time %s" % exp._run_time)

        (test, label_test) = get_X_y(test_file, label_column, sep=',')
        (acc2, auc2) = evaluate_binary_classifier(
            label_test.iloc[:, 0].values,
            out_data.loc[:, 'PredictedLabel'].values,
            out_data.loc[:, 'Probability'].values)
        print('ACC %s, AUC %s' % (acc2, auc2))
        assert abs(acc1 - acc2) < 0.02
        assert abs(auc1 - auc2) < 0.02
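For reference, the `file_schema` string above follows the ML.NET loader grammar: `sep=,` sets the delimiter; each `col=name:type:index` entry binds a column name to a data type (`R4` is a 4-byte float, `TX` is text) and a zero-based column position, with a range such as `9-14` packing several source columns into one vector-valued column; and `header+` marks the file's first row as a header.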
Example #23
import pickle
import unittest

from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import FastLinearBinaryClassifier
from nimbusml.utils import get_X_y
from numpy.testing import assert_almost_equal

train_file = get_dataset('uciadult_train').as_filepath()
test_file = get_dataset('uciadult_test').as_filepath()
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'ethnicity', 'sex', 'native-country-region'
]
label_column = 'label'
(train, label) = get_X_y(train_file, label_column, sep=',')
(test, test_label) = get_X_y(test_file, label_column, sep=',')


class TestLoadSave(unittest.TestCase):
    def test_model_dataframe(self):
        model_nimbusml = Pipeline(
            steps=[('cat', OneHotVectorizer() << categorical_columns),
                   ('linear',
                    FastLinearBinaryClassifier(shuffle=False, train_threads=1)
                    )])

        model_nimbusml.fit(train, label)

        # Save with pickle
        pickle.dump(model_nimbusml, open('nimbusml_model.p', 'wb'))
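The listing stops after the save; a minimal sketch of the rest of the round trip (assuming the `nimbusml_model.p` path above) would load the pickle back and score the held-out data:

        # load the pickled pipeline and score; predictions should match
        # those of the original model
        with open('nimbusml_model.p', 'rb') as f:
            model_nimbusml_loaded = pickle.load(f)
        scores = model_nimbusml_loaded.predict(test)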
Example #24
import numpy as np
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# use 'wiki_detox_train' data set to create test and train data
# Sentiment	SentimentText
# 1	  ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1	  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
(train, label) = get_X_y(train_file, label_column='Sentiment', sep='\t')

X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space
texttransform = NGramFeaturizer(word_feature_extractor=Ngram(),
                                vector_normalizer='None') << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

ppl = Pipeline([texttransform, nb])

ppl.fit(X_train, y_train)

scores = ppl.predict(X_test)['PredictedLabel']

# evaluate the model
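The example is cut off at its final comment; a minimal sketch of the evaluation step (assuming the `Sentiment` label column produced by the `train_test_split` above):

accuracy = np.mean(y_test['Sentiment'].values == scores.values)
print('accuracy: %s' % accuracy)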