Example #1
    def test_lightgbmranker_asfilestream(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        # Pure-nimbusml paradigm
        train_stream = FileDataStream.read_csv(file_path, encoding='utf-8')

        # pipeline
        pipeline = Pipeline([
            # the group_id column must be of key type
            ToKey(columns={'rank': 'rank', 'group': 'group'}),
            LightGbmRanker(
                feature=[
                    'Class',
                    'dep_day',
                    'duration'],
                label='rank',
                group_id='group')
        ])

        # train
        pipeline.fit(train_stream)

        # test
        eval_stream = FileDataStream.read_csv(file_path)
        metrics, _ = pipeline.test(eval_stream)
        assert_almost_equal(
            metrics['NDCG@1'][0],
            0.43571429,
            decimal=7,
            err_msg="NDCG@1 should be %s" %
                    0.43571429)
        assert_almost_equal(
            metrics['NDCG@2'][0],
            0.5128226,
            decimal=7,
            err_msg="NDCG@2 should be %s" %
                    0.5128226)
        assert_almost_equal(
            metrics['NDCG@3'][0],
            0.55168069,
            decimal=7,
            err_msg="NDCG@3 should be %s" %
                    0.55168069)
        assert_almost_equal(
            metrics['DCG@1'][0],
            4.688759,
            decimal=3,
            err_msg="DCG@1 should be %s" %
                    4.688759)
        assert_almost_equal(
            metrics['DCG@2'][0],
            9.012395,
            decimal=3,
            err_msg="DCG@2 should be %s" %
                    9.012395)
        assert_almost_equal(
            metrics['DCG@3'][0],
            11.446943,
            decimal=3,
            err_msg="DCG@3 should be %s" %
                    11.446943)
Example #2
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
import unittest

import pandas
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer

path = get_dataset("wiki_detox_train").as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
df = data.to_df().head()
X = df['SentimentText']


class TestPipelineTransformMethod(unittest.TestCase):
    def test_transform_only_pipeline_transform_method(self):
        p = Pipeline(
            [NGramFeaturizer(char_feature_extractor=None) << 'SentimentText'])
        p.fit(X)
        xf = p.transform(X)
        assert 'SentimentText.==rude==' in xf.columns


if __name__ == '__main__':
    unittest.main()
Example #3
###############################################################################
# CV - cross-validate data
import numpy as np
from nimbusml import Pipeline, FileDataStream, DataSchema
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionClassifier, \
    FastLinearRegressor
from nimbusml.model_selection import CV
from nimbusml.preprocessing.missing_values import Indicator, Handler

# Case 1: Default usage of CV

path = get_dataset('infert').as_filepath()
schema = DataSchema.read_schema(path, numeric_dtype=np.float32)
data = FileDataStream.read_csv(path, schema=schema)

pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LogisticRegressionClassifier(feature=['age', 'spontaneous', 'edu'],
                                 label='induced')
])

# Do 3-fold cross-validation
cv_results = CV(pipeline).fit(data, cv=3)

# print summary statistic of metrics
print(cv_results['metrics_summary'])

# print metrics for all folds
print(cv_results['metrics'])
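
# Case 2 (hedged sketch, not part of the original snippet): the unused
# FastLinearRegressor import above suggests a regression variant; assuming
# the same 'infert' stream, cross-validate a regressor predicting the
# numeric 'parity' column.
reg_pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastLinearRegressor(feature=['age', 'edu'], label='parity')
])
reg_results = CV(reg_pipeline).fit(data, cv=3)
print(reg_results['metrics_summary'])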
Example #4
###############################################################################
# LightLda
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer, LightLda
from nimbusml.feature_extraction.text.extractor import Ngram

# data input as a FileDataStream
path = get_dataset('topics').as_filepath()
data = FileDataStream.read_csv(path, sep=",")
print(data.head())
#                               review                    review_reverse  label
# 0  animals birds cats dogs fish horse   radiation galaxy universe duck      1
# 1    horse birds house fish duck cats  space galaxy universe radiation      0
# 2         car truck driver bus pickup                       bus pickup      1

# transform usage
pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    vector_normalizer='None',
                    columns=['review']),
    LightLda(num_topic=3, columns=['review'])
])

# fit and transform
features = pipeline.fit_transform(data)
print(features.head())
#   label  review.0  review.1  review.2                     review_reverse
# 0      1  0.500000  0.333333  0.166667     radiation galaxy universe duck
# 1      0  0.000000  0.166667  0.833333    space galaxy universe radiation
# 2      1  0.400000  0.200000  0.400000                         bus pickup
Example #5
###############################################################################
# NGramFeaturizer
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.feature_extraction.text.stopwords import CustomStopWordsRemover

# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()

data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#   Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

xf = NGramFeaturizer(word_feature_extractor=Ngram(),
                     stop_words_remover=CustomStopWordsRemover(['!',
                                                                '$',
                                                                '%',
                                                                '&',
                                                                '\'',
                                                                '\'d']),
                     columns={'features': ['SentimentText']})

# fit and transform
features = xf.fit_transform(data)
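
# hedged follow-up (not in the original snippet): inspect a few of the
# generated n-gram columns, mirroring the other examples in this listing
print(features.head())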
Example #6
###############################################################################
# ColumnSelector
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnSelector

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path, sep=',')

# transform usage
xf = ColumnSelector(columns=['education', 'age'])

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#   age education
# 0   26    0-5yrs
# 1   42    0-5yrs
# 2   39    0-5yrs
# 3   34    0-5yrs
# 4   35   6-11yrs
Example #7
###############################################################################
# SsaForecaster
import pandas as pd
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.timeseries import SsaForecaster

# data input (as a FileDataStream)
path = get_dataset('timeseries').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#      t1    t2      t3
# 0  0.01  0.01  0.0100
# 1  0.02  0.02  0.0200
# 2  0.03  0.03  0.0200
# 3  0.03  0.03  0.0250
# 4  0.03  0.03  0.0005

# define the training pipeline
pipeline = Pipeline([
    SsaForecaster(series_length=6,
                  train_size=8,
                  window_size=3,
                  horizon=2,
                  columns={'t2_fc': 't2'})
])

result = pipeline.fit_transform(data)

pd.set_option('display.float_format', lambda x: '%.2f' % x)
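
# hedged follow-up (not in the original snippet): the 't2_fc' output holds
# 'horizon' forecast columns per row
print(result.head())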
Example #8
import unittest

import numpy as np
import pandas as pd
import six
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer, \
    OneHotVectorizer
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text import WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.linear_model import FastLinearBinaryClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import GridSearchCV
from sklearn.utils.testing import assert_raises

train_file = get_dataset('uciadult_train').as_filepath()
test_file = get_dataset('uciadult_test').as_filepath()
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'ethnicity', 'sex', 'native-country-region'
]
label_column = 'label'


class TestSweep(unittest.TestCase):
    def test_hyperparameters_sweep(self):
        # general test with combination of named and unnamed steps
        np.random.seed(0)
        df = pd.DataFrame(
            dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'],
                 workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
Example #9
import unittest

import numpy as np
import pandas as pd
from nimbusml import FileDataStream
from nimbusml import Pipeline, Role
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import FastLinearRegressor
from nimbusml.preprocessing import ToKey
from nimbusml.utils.exports import dot_export_pipeline

# set up dataframe
path = get_dataset("gen_tickettrain").as_filepath()
df = pd.read_csv(path)
df = df.rename(index=str, columns={'rank': 'label_1', 'group': 'group_2'})
features = ['price', 'Class', 'duration']
df['group_2'] = df['group_2'].astype(np.uint32)
X = df.drop(['label_1'], axis=1)
y = df['label_1']

# set up filedatastream
fds = FileDataStream.read_csv(path, names={0: 'label_1', 1: 'group_2'})


def clone_and_check(pipe):
    pipe_attrs = pipe.__dict__.copy()
    cloned_pipe = pipe.clone()
    cloned_attrs = cloned_pipe.__dict__.copy()
Example #10
    def test_syntax_slots_wo_pipeline(self):
        # data
        df = get_dataset("infert").as_df()
        df = df.drop(['row_num', ], axis=1)
        X = df.drop('case', axis=1)
        y = df['case']

        # transform
        xf1 = OneHotVectorizer(columns=['age', 'parity', 'education_str'])
        X_xf1 = xf1.fit_transform(X, verbose=0)
        assert "age.21" in list(X_xf1.columns)

        # learner
        # (1 .a.)
        model = AveragedPerceptronBinaryClassifier()

        # (1. b)
        try:
            model = AveragedPerceptronBinaryClassifier(feature=['age'])
            model.fit(X_xf1, y, verbose=0)
            cont = True
            assert False
        except Exception as e:
            # does not work
            cont = False
            print(e)

        if cont:
            y_pred = model.predict(X_xf1)
            assert y_pred.shape == (248, 3)

        pipeline = Pipeline([
            OneHotVectorizer(columns=['age', 'parity', 'education_str']),
            AveragedPerceptronBinaryClassifier(feature='age')
        ])

        pipeline.fit(X, y, verbose=0)

        y_pred_withpipeline = pipeline.predict(X)
        print(y_pred_withpipeline.head())
        assert y_pred_withpipeline.shape == (248, 3)

        metrics, scores = pipeline.test(X, y, output_scores=True)
        print(metrics)
        assert scores.shape == (248, 3)
        assert metrics.shape == (1, 11)

        # back to X_xf1
        print(list(X_xf1.columns))
        l1 = list(sorted(set(_.split('.')[-1] for _ in X_xf1.columns)))
        levels = [['age', 'education', 'education_str', 'parity',
                   'pooled', 'spontaneous', 'stratum', 'induced'], [''] + l1]
        names = ['columns', 'slots']
        labels = [[], []]
        ages = []
        for _ in X_xf1.columns:
            spl = _.split('.')
            l1 = levels[0].index(spl[0])
            try:
                l2 = levels[1].index(spl[1])
            except IndexError:
                l2 = levels[1].index('')
            labels[0].append(l1)
            labels[1].append(l2)
            if spl[0] == 'age':
                ages.append(l2)
        X_xf1.columns = pandas.MultiIndex(
            levels=levels, labels=labels, names=names)
        print(X_xf1.head(n=2).T)

        col_ages = [('age', a) for a in ages]
        print(col_ages)
        try:
            model = AveragedPerceptronBinaryClassifier(feature=col_ages)
            model.fit(X_xf1, y, verbose=0)
            y_pred = model.predict(X_xf1)
            assert y_pred.shape == (248, 3)
        except Exception as e:
            # Does not work, probably confusion between list and tuple in nimbusml
            print(e)

        try:
            model = AveragedPerceptronBinaryClassifier(feature=['age'])
            model.fit(X_xf1, y, verbose=0)
            y_pred = model.predict(X_xf1)
            assert y_pred.shape == (248, 3)
        except Exception as e:
            # Does not work.
            print(e)
Example #11
    @classmethod
    def setUpClass(cls):
        df = get_dataset("infert").as_df()
        # remove ':' and ' ' from column names, and encode the categorical column
        df.columns = [i.replace(': ', '') for i in df.columns]
        df = (OneHotVectorizer() << ['education_str']).fit_transform(df)
        cls.X, cls.y = split_features_and_label(df, 'case')
Example #12
    @classmethod
    def setUpClass(cls):
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        cls.X, cls.y = split_features_and_label(df, 'Label')
Example #13
import pickle

import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesBinaryClassifier
from sklearn.model_selection import train_test_split

# Use the built-in data set 'infert' to create test and train data
#   Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum  \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#   pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs
np.random.seed(0)

df = get_dataset("infert").as_df()[['stratum', 'parity', 'case']]

# remove : and ' ' from column names
df.columns = [i.replace(': ', '') for i in df.columns]

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'case'], df['case'])

# train model and see accuracy
ftree = FastTreesBinaryClassifier().fit(X_train, y_train)
scores = ftree.predict(X_test)
print('Accuracy1:', np.mean(y_test == [i for i in scores]))

# Unpickle model and score. We should get the exact same accuracy as above
s = pickle.dumps(ftree)
ftree2 = pickle.loads(s)
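
# hedged completion (not in the original snippet): score with the unpickled
# model; the accuracy should match Accuracy1 above
scores2 = ftree2.predict(X_test)
print('Accuracy2:', np.mean(y_test == [i for i in scores2]))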
Example #14
###############################################################################
# OrdinaryLeastSquaresRegressor
from nimbusml import Pipeline, FileDataStream, Role
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import OrdinaryLeastSquaresRegressor
from nimbusml.preprocessing.missing_values import Filter

# use the built-in data set 'airquality' to create test and train data
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2

train_file = get_dataset("airquality").as_filepath()
schema = "col=none:R4:0 col=ozone:R4:1 col=solar:R4:2 col=wind:R4:3 " \
         "col=temp:R4:4 col=month:R4:5 col=day:R4:6 sep=, header=+"

fds = FileDataStream(train_file, schema=schema)

# set up pipeline
pipe = Pipeline([
    Filter() << ['ozone'],
    OrdinaryLeastSquaresRegressor() << {
        Role.Label: 'ozone',
        Role.Feature: ['solar', 'wind', 'temp', 'month', 'day']
    }
])

# train and evaluate the model
metrics, scores = pipe.fit(fds).test(fds, "ozone", output_scores=True)
print(metrics)
Example #15
# FastForestBinaryClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastForestBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from sklearn.model_selection import train_test_split

# use the built-in data set 'infert' to create test and train data
#   Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum  \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#   pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs
np.random.seed(0)

df = get_dataset("infert").as_df()

# remove : and ' ' from column names, and encode categorical column
df.columns = [i.replace(': ', '') for i in df.columns]
df = (OneHotVectorizer() << 'education_str').fit_transform(df)

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'case'], df['case'])

fforest = FastForestBinaryClassifier().fit(X_train, y_train)
scores = fforest.predict(X_test)

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
Example #16
# Pipeline with observation level feature contributions

# Scoring a dataset with a trained model produces a score, or prediction, for
# each example. To understand and explain these predictions it can be useful to
# inspect which features influenced them most significantly. This function
# computes a model-specific list of per-feature contributions to the score for
# each example. These contributions can be positive (they make the score
# higher) or negative (they make the score lower).

from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.linear_model import LogisticRegressionBinaryClassifier

# data input (as a FileDataStream)
path = get_dataset('uciadult_train').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    label  workclass     education  ... capital-loss hours-per-week
# 0      0    Private          11th  ...            0             40
# 1      0    Private       HS-grad  ...            0             50
# 2      1  Local-gov    Assoc-acdm  ...            0             40
# 3      1    Private  Some-college  ...            0             40
# 4      0          ?  Some-college  ...            0             30

# define the training pipeline with a linear model
lr_pipeline = Pipeline([LogisticRegressionBinaryClassifier(
    feature=['age', 'education-num', 'hours-per-week'], label='label')])

# train the model
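# hedged completion (not in the original snippet): fit the pipeline, then
# compute per-example feature contributions; this assumes nimbusml's
# Pipeline.get_feature_contributions, which implements the behavior the
# comments above describe
lr_model = lr_pipeline.fit(data)
lr_contributions = lr_model.get_feature_contributions(data)
print(lr_contributions.head())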
Example #17
    def data_pandas(self):
        simpleinput_file = get_dataset("gen_tickettrain").as_filepath()
        data = pd.read_csv(simpleinput_file)
        data['group'] = data['group'].astype(str)
        return data
Example #18
# PcaTransformer
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.decomposition import PcaTransformer
from nimbusml.ensemble import LightGbmBinaryClassifier
from nimbusml.preprocessing.schema import ColumnConcatenator

# use the built-in data set 'infert' to create test and train data
#   Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum  \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#   pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs

train_file = get_dataset("infert").as_filepath()
schema = "col=none:R4:0 col=education:R4:1 col=age:R4:2 col=parity:R4:3 " \
         "col=induced:R4:4 col=case:R4:5 col=spontaneous:R4:6 " \
         "col=stratum:R4:7 col=pooledstratum:R4:8 col=educationstr:R4:9 " \
         "sep=, header=+"
fds = FileDataStream(train_file, schema=schema)

# target and features columns
y = 'case'
X = ['age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooledstratum']

# observe the gradual impact of dimensionality reduction on AUC;
# reducing dimensions should degrade the signal gradually while
# preserving the traits of the original dataset as much as possible
for rank in range(len(X), 2, -1):
    print('Number of dimensions=', rank)
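    # hedged sketch of the truncated loop body (not in the original snippet):
    # concatenate the feature columns, reduce them to 'rank' dimensions with
    # PCA, train, and report AUC
    pipe = Pipeline([
        ColumnConcatenator(columns={'features': X}),
        PcaTransformer(rank=rank, columns='features'),
        LightGbmBinaryClassifier(feature='features', label=y)
    ])
    metrics, _ = pipe.fit(fds).test(fds, y)
    print('AUC=', metrics['AUC'].values)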
Example #19
###############################################################################
# LightGbmRanker
import numpy as np
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker

# data input (as a FileDataStream)
path = get_dataset('gen_tickettrain').as_filepath()

# LightGbmRanker requires key type for group column
data = FileDataStream.read_csv(path, dtype={'group': np.uint32})

# define the training pipeline
pipeline = Pipeline([
    LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                   label='rank',
                   group_id='group')
])

# train, predict, and evaluate.
# TODO: Replace with CV
metrics, predictions = pipeline \
    .fit(data) \
    .test(data, output_scores=True)

# print predictions
print(predictions.head())
#       Score
# 0 -0.124121
# 1 -0.124121
Example #20
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastForestBinaryClassifier, FastForestRegressor
from nimbusml.ensemble import FastTreesBinaryClassifier, FastTreesRegressor
from nimbusml.ensemble import FastTreesTweedieRegressor
from nimbusml.linear_model import AveragedPerceptronBinaryClassifier
from nimbusml.linear_model import FastLinearBinaryClassifier
from nimbusml.linear_model import FastLinearClassifier
from nimbusml.linear_model import FastLinearRegressor
from nimbusml.linear_model import LogisticRegressionBinaryClassifier
from nimbusml.linear_model import LogisticRegressionClassifier
from nimbusml.linear_model import OnlineGradientDescentRegressor
from nimbusml.linear_model import OrdinaryLeastSquaresRegressor
from nimbusml.linear_model import PoissonRegressionRegressor
from nimbusml.linear_model import SgdBinaryClassifier
# from nimbusml.linear_model import SymSgdBinaryClassifier
from nimbusml.multiclass import OneVsRestClassifier
from nimbusml.naive_bayes import NaiveBayesClassifier
from sklearn.utils.testing import assert_raises

train_file = get_dataset("uciadult_train").as_filepath()
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'ethnicity', 'sex', 'native-country-region'
]
file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 col=workclass:TX:1 ' \
              'col=education:TX:2 col=marital-status:TX:3 ' \
              'col=occupation:TX:4 col=relationship:TX:5 col=ethnicity:TX:6 ' \
              'col=sex:TX:7 col=native-country-region:TX:8 header+'
label_column = 'label'
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
Example #21
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import EnsembleRegressor
from nimbusml.ensemble.feature_selector import RandomFeatureSelector
from nimbusml.ensemble.output_combiner import RegressorMedian
from nimbusml.ensemble.subset_selector import RandomPartitionSelector
from nimbusml.ensemble.sub_model_selector import RegressorBestDiverseSelector
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# use the built-in data set 'airquality' to create test and train data
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2
np.random.seed(0)

df = get_dataset("airquality").as_df().fillna(0)
df = df[df.Ozone.notnull()]

X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'Ozone'], df['Ozone'])

# train a model with default sampling and ensembling parameters and score
ensemble_with_defaults = EnsembleRegressor(num_models=3).fit(X_train, y_train)
scores = ensemble_with_defaults.predict(X_test)

# evaluate the model
print('R-squared fit:', r2_score(
    y_test,
    scores,
))
# R-squared fit: 0.12144964995862884
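
# hedged sketch (not in the original snippet): the unused selector/combiner
# imports above suggest a customized ensemble; the parameter names below are
# assumptions based on those imports, not verified API
ensemble_with_options = EnsembleRegressor(
    num_models=3,
    sampling_type=RandomPartitionSelector(
        feature_selector=RandomFeatureSelector()),
    sub_model_selector_type=RegressorBestDiverseSelector(),
    output_combiner=RegressorMedian()).fit(X_train, y_train)
print('R-squared fit:',
      r2_score(y_test, ensemble_with_options.predict(X_test)))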
Example #22
###############################################################################
# LightGbmClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmClassifier
from sklearn.model_selection import train_test_split

# use 'iris' data set to create test and train data
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
np.random.seed(0)

df = get_dataset("iris").as_df()
df.drop(['Species'], inplace=True, axis=1)

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])
lr = LightGbmClassifier().fit(X_train, y_train)

scores = lr.predict(X_test)

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
Example #23
def infert_df(label_name):
    df = get_dataset('infert').as_df()
    df = (OneHotVectorizer() << 'education_str').fit_transform(df)
    X, y = split_features_and_label(df, label_name)
    return X, y
Example #24
import os

import numpy as np
import pandas as pd
from nimbusml.datasets import get_dataset

SHOW_ONNX_JSON = False
SHOW_TRANSFORMED_RESULTS = False
SHOW_FULL_PANDAS_OUTPUT = False

if SHOW_FULL_PANDAS_OUTPUT:
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.width', 10000)

script_path = os.path.realpath(__file__)
script_dir = os.path.dirname(script_path)

#      Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label    Species  Setosa
# 0             5.1          3.5           1.4          0.2     0     setosa     1.0
# 1             4.9          3.0           1.4          0.2     0     setosa     1.0
iris_df = get_dataset("iris").as_df()
iris_df.drop(['Species'], axis=1, inplace=True)

iris_with_nan_df = iris_df.copy()
iris_with_nan_df.loc[1, 'Petal_Length'] = np.nan

iris_no_label_df = iris_df.drop(['Label'], axis=1)
iris_binary_df = iris_no_label_df.rename(columns={'Setosa': 'Label'})
iris_regression_df = iris_no_label_df.drop(
    ['Setosa'], axis=1).rename(columns={'Petal_Width': 'Label'})

#   Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum  pooled.stratum education_str
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0     3.0        0-5yrs
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0     1.0        0-5yrs
infert_df = get_dataset("infert").as_df()
infert_df.columns = [i.replace(': ', '') for i in infert_df.columns]
Example #25
# PFI works by performing this permutation analysis across all the features
# of a model, one after another.

# PFI is supported for binary classifiers, classifiers, regressors, and
# rankers.

from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionBinaryClassifier, \
    FastLinearClassifier, FastLinearRegressor
from nimbusml.preprocessing import ToKey
from numpy.testing import assert_almost_equal

# data input (as a FileDataStream)
adult_path = get_dataset('uciadult_train').as_filepath()
classification_data = FileDataStream.read_csv(adult_path)
print(classification_data.head())
#    label  workclass     education  ... capital-loss hours-per-week
# 0      0    Private          11th  ...            0             40
# 1      0    Private       HS-grad  ...            0             50
# 2      1  Local-gov    Assoc-acdm  ...            0             40
# 3      1    Private  Some-college  ...            0             40
# 4      0          ?  Some-college  ...            0             30

######################################
# PFI for Binary Classification models
######################################
# define the training pipeline with a binary classifier
binary_pipeline = Pipeline([
    OneHotVectorizer(columns=['education']),