from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import OnlineGradientDescentRegressor

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    OnlineGradientDescentRegressor(feature=['parity', 'edu'], label='age')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#       Score
# 0  28.103731
# 1  21.805904
# 2  28.103731
# 3  25.584600
# 4  33.743286
# print evaluation metrics
print(metrics)
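The TODO above asks for cross-validation instead of evaluating on the training data; a minimal sketch, assuming nimbusml's model_selection.CV wrapper and its 'metrics_summary' result key:

from nimbusml.model_selection import CV

# cross-validate a fresh copy of the same pipeline over 5 folds
cv_pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    OnlineGradientDescentRegressor(feature=['parity', 'edu'], label='age')
])
cv_results = CV(cv_pipeline).fit(data, cv=5)
print(cv_results['metrics_summary'])  # assumed result key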
Example #2

from nimbusml import Pipeline
from nimbusml.ensemble import LightGbmRegressor, VotingRegressor
from nimbusml.linear_model import (OnlineGradientDescentRegressor,
                                   OrdinaryLeastSquaresRegressor)

# olsrArgs, ogdArgs and lgbmArgs are keyword-argument dicts for the three
# regressors; their definitions (and train_df/test_df) are truncated in
# this snippet.

if show_individual_predictions:
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r1.fit(train_df)
    result = r1.predict(test_df)
    print(result)

    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r2.fit(train_df)
    result = r2.predict(test_df)
    print(result)

    r3 = LightGbmRegressor(**lgbmArgs)
    r3.fit(train_df)
    result = r3.predict(test_df)
    print(result)

# Perform a prediction using an ensemble
# of all three of the above predictors.

r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
r2 = OnlineGradientDescentRegressor(**ogdArgs)
r3 = LightGbmRegressor(**lgbmArgs)
pipeline = Pipeline(
    [VotingRegressor(estimators=[r1, r2, r3], combiner='Average')])

pipeline.fit(train_df)
result = pipeline.predict(test_df)
print(result)
Example #3

import numpy
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastForestBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotVectorizer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path,
                               sep=',',
                               numeric_dtype=numpy.float32,
                               names={
                                   0: 'row_num',
                                   5: 'case'
                               })
print(data.head())
#    age  case education  induced  parity  pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs      1.0     6.0             3.0      1.0  ...
# 1  42.0   1.0    0-5yrs      1.0     1.0             1.0      2.0  ...
# 2  39.0   1.0    0-5yrs      2.0     6.0             4.0      3.0  ...
# 3  34.0   1.0    0-5yrs      2.0     4.0             2.0      4.0  ...
# 4  35.0   1.0   6-11yrs      1.0     3.0            32.0      5.0  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastForestBinaryClassifier(feature=['age', 'edu', 'induced'], label='case')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel      Score
# 0             0.0 -26.985743
# 1             0.0 -26.562090
# 2             0.0 -24.832508
# 3             0.0 -23.799389
# 4             0.0 -19.612534
# print evaluation metrics
print(metrics)
Example #4
    def test_example_success(self):

        like = [
            True, False, True, False, True, False, True, False, True, False,
            True, False, True, False, True, False, True, False, True, False,
            True, False, True, False, True
        ]
        x1 = [(5. if _ else 4.) for _ in like]
        x2 = [(-5. if _ else -4.) for _ in like]
        x1[0] = 50
        x2[1] = 50
        x2[2] = 50
        train_data = pandas.DataFrame(data=dict(like=like, x1=x1, x2=x2),
                                      dtype=numpy.float32)

        X = train_data.drop('like', axis=1)
        y = train_data[['like']]
        transform_2 = MutualInformationSelector()
        exp = Pipeline([transform_2])
        res = exp.fit_transform(X, y)
        assert res is not None

        transform_2 = MutualInformationSelector(slots_in_output=2)
        pipe = Pipeline([transform_2])
        res = pipe.fit_transform(X, y)
        assert res is not None

        transform_2 = MutualInformationSelector() << {
            Role.Feature: ['x1', 'x2'],
            Role.Label: 'like'
        }
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column == 'like'
        assert transform_2.input == ['x1', 'x2']
        assert transform_2.output == ['Feature']
        exp = Pipeline([transform_2])
        res = exp.fit_transform(train_data)
        assert res is not None

        transform_2 = MutualInformationSelector() << {
            "zoo": ['x1', 'x2'],
            Role.Label: 'like'
        }
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column == 'like'
        assert transform_2.input == ['x1', 'x2']
        assert transform_2.output == ['zoo']
        exp = Pipeline([transform_2])
        res = exp.fit_transform(train_data)
        assert res is not None

        transform_2 = MutualInformationSelector() << {
            "zoo": ['x1'],
            Role.Label: 'like'
        }
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column == 'like'
        assert transform_2.input == ['x1']
        assert transform_2.output == ['zoo']
        exp = Pipeline([transform_2])
        res = exp.fit_transform(train_data)
        assert res is not None

        transform_2 = MutualInformationSelector(slots_in_output=1,
                                                columns=['x1'],
                                                label='like')
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column == 'like'
        assert transform_2.input == ['x1']
        assert transform_2.output == ['x1']
        pipe = Pipeline([transform_2])
        pipe.fit(train_data)
        res = pipe.transform(train_data)
        assert res is not None
Example #5
File: Image.py  Project: yazici/NimbusML
import numpy as np
import pandas

from nimbusml import Pipeline
from nimbusml.datasets.image import get_RevolutionAnalyticslogo, get_Microsoftlogo
from nimbusml.feature_extraction.image import Loader, Resizer, PixelExtractor
from nimbusml.linear_model import FastLinearBinaryClassifier

data = pandas.DataFrame(
    data=dict(Path=[get_RevolutionAnalyticslogo(),
                    get_Microsoftlogo()],
              Label=[True, False]))

X = data[['Path']]
y = data[['Label']]

# define the training pipeline
pipeline = Pipeline([
    Loader(columns={'ImgPath': 'Path'}),
    Resizer(image_width=32, image_height=32, columns={'ImgResize': 'ImgPath'}),
    PixelExtractor(columns={'ImgPixels': 'ImgResize'}),
    FastLinearBinaryClassifier(feature='ImgPixels')
])

# train
pipeline.fit(X, y)

# predict
scores = pipeline.predict(X)
print("Predicted Labels:", scores.PredictedLabel.values)
# Predicted Labels : [True False]
print("Accuracy:", np.mean(y.Label.values == scores.PredictedLabel.values))
# Accuracy : 1
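The fitted image pipeline can also be persisted and restored for later scoring; a hedged sketch, assuming nimbusml Pipeline's save_model/load_model pair ('image_model.zip' is an arbitrary path):

# persist the fitted pipeline to disk, then reload it for scoring
pipeline.save_model('image_model.zip')

restored = Pipeline()
restored.load_model('image_model.zip')
print(restored.predict(X).PredictedLabel.values)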
Example #6

from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmBinaryClassifier
from nimbusml.ensemble.booster import Goss
from nimbusml.feature_extraction.categorical import OneHotVectorizer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LightGbmBinaryClassifier(feature=['induced', 'edu'],
                             label='case',
                             booster=Goss(top_rate=0.9))
])

# train, predict, and evaluate
metrics, predictions = pipeline.fit(data, 'case').test(data,
                                                       output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel  Probability     Score
# 0               1     0.612220  0.913309
# 1               1     0.612220  0.913309
# 2               0     0.334486 -1.375929
# 3               0     0.334486 -1.375929
# 4               0     0.421264 -0.635176
Example #7
from nimbusml import Pipeline, FileDataStream
from nimbusml.cluster import KMeansPlusPlus
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    KMeansPlusPlus(n_clusters=5, feature=['induced', 'edu', 'parity'])
])

# train, predict, and evaluate
metrics, predictions = pipeline \
    .fit(data) \
    .test(data, 'induced', output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel   Score.0   Score.1   Score.2   Score.3   Score.4
# 0               4  2.732253  2.667988  2.353899  2.339244  0.092014
# 1               4  2.269290  2.120064  2.102576  2.222578  0.300347
# 2               4  3.482253  3.253153  2.425328  2.269245  0.258680
# 3               4  3.130401  2.867317  2.158132  2.055911  0.175347
# 4               2  0.287809  2.172567  0.036439  2.102578  2.050347
Example #8
def test_pipeline_clone_dataframe_transforms(self):
    pipe = Pipeline([
        OneHotVectorizer(columns={'onehot': 'group_2'})
    ])
    fit_transform_clone_and_check(pipe, df)
Example #9
def test_pipeline_clone_filedatastream_transforms(self):
    pipe = Pipeline([
        OneHotVectorizer(columns={'onehot': 'group_2'})
    ])
    fit_transform_clone_and_check(pipe, fds)
Example #10
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.timeseries import SsaChangePointDetector

# data input (as a FileDataStream)
path = get_dataset('timeseries').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#      t1    t2      t3
# 0  0.01  0.01  0.0100
# 1  0.02  0.02  0.0200
# 2  0.03  0.03  0.0200
# 3  0.03  0.03  0.0250
# 4  0.03  0.03  0.0005

# define the training pipeline
pipeline = Pipeline([
    SsaChangePointDetector(columns={'t2_cp': 't2'},
                           change_history_length=4,
                           training_window_size=8,
                           seasonal_window_size=3)
])

result = pipeline.fit_transform(data)
print(result)

#      t1     t2       t3  t2_cp.Alert  t2_cp.Raw Score  t2_cp.P-Value Score  t2_cp.Martingale Score
# 0  0.01   0.01   0.0100          0.0        -0.111334         5.000000e-01                0.001213
# 1  0.02   0.02   0.0200          0.0        -0.076755         4.862075e-01                0.001243
# 2  0.03   0.03   0.0200          0.0        -0.034871         3.856320e-03                0.099119
# 3  0.03   0.03   0.0250          0.0        -0.012559         8.617091e-02                0.482400
# 4  0.03   0.03   0.0005          0.0        -0.015723         2.252377e-01                0.988788
# 5  0.03   0.05   0.0100          0.0        -0.001133         1.767711e-01                2.457946
# 6  0.05   0.07   0.0500          0.0         0.006265         9.170460e-02                0.141898
# 7  0.07   0.09   0.0900          0.0         0.002383         2.701134e-01                0.050747
Example #11
import numpy as np
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# use 'wiki_detox_train' data set to create test and train data
# Sentiment	SentimentText
# 1	  ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1	  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
(train, label) = get_X_y(train_file, label_column='Sentiment', sep='\t')

X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space
texttransform = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None') << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

ppl = Pipeline([texttransform, nb])
ppl.fit(X_train, y_train)

# evaluate the model
metrics, scores = ppl.test(X_test, y_test, output_scores=True)

print(metrics)
Example #12
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import SgdBinaryClassifier

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    SgdBinaryClassifier(feature=['parity', 'edu'], label='case')
])

# train, predict, and evaluate
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel  Probability     Score
# 0               0     0.363427 -0.560521
# 1               0     0.378848 -0.494439
# 2               0     0.363427 -0.560521
# 3               0     0.369564 -0.534088
# 4               0     0.336350 -0.679603
# print evaluation metrics
print(metrics)
Example #13
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmClassifier
from nimbusml.ensemble.booster import Dart
from nimbusml.feature_extraction.categorical import OneHotVectorizer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LightGbmClassifier(feature=['parity', 'edu'],
                       label='induced',
                       booster=Dart(reg_lambda=0.1))
])

# train, predict, and evaluate
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel   Score.0   Score.1   Score.2
# 0               2  0.070722  0.145439  0.783839
# 1               0  0.737733  0.260116  0.002150
# 2               2  0.070722  0.145439  0.783839
# 3               0  0.490715  0.091749  0.417537
# 4               0  0.562419  0.197818  0.239763
# print evaluation metrics
print(metrics)
Example #14
# LightGbmRanker
import numpy as np
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker

# data input (as a FileDataStream)
path = get_dataset('gen_tickettrain').as_filepath()

# LightGbmRanker requires key type for group column
data = FileDataStream.read_csv(path, dtype={'group': np.uint32})

# define the training pipeline
pipeline = Pipeline([
    LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                   label='rank',
                   group_id='group')
])

# train, predict, and evaluate.
metrics, predictions = pipeline \
    .fit(data) \
    .test(data, output_scores=True)

# print predictions
print(predictions.head())
#       Score
# 0 -0.124121
# 1 -0.124121
# 2 -0.124121
# 3 -0.376062
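If the group column cannot be cast to a key type at load time, the conversion can also be done inside the pipeline; a minimal sketch, assuming nimbusml.preprocessing's ToKey transform:

from nimbusml.preprocessing import ToKey

# convert 'group' to a key type in-pipeline rather than via dtype at read time
pipeline = Pipeline([
    ToKey(columns={'group': 'group'}),
    LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                   label='rank',
                   group_id='group')
])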
Example #15
def test_pass_decision_function_multiclass_with_pipeline(self):
    assert_almost_equal(decfun_sum(Pipeline([NaiveBayesClassifier()])),
                        -96.87325,
                        decimal=4,
                        err_msg=invalid_decision_function_output)
Example #16
def test_pipeline_clone_filedatastream_transforms_shift_operator(self):
    pipe = Pipeline([
        OneHotVectorizer() << {'onehot': 'group_2'}
    ])
    fit_transform_clone_and_check(pipe, fds)
Example #17
def test_fail_decision_function_multiclass_with_pipeline(self):
    check_unsupported_decision_function(
        self, Pipeline([LogisticRegressionClassifier()]), X_train, y_train,
        X_test)
Example #18
###############################################################################
# Pipeline
import numpy as np
import pandas as pd
from nimbusml import Pipeline, FileDataStream
from nimbusml.linear_model import FastLinearRegressor
from nimbusml.preprocessing.normalization import MeanVarianceScaler

X = np.array([[1, 2.0], [2, 4], [3, 0.7]])
Y = np.array([2, 3, 1.5])

df = pd.DataFrame(dict(y=Y, x1=X[:, 0], x2=X[:, 1]))

pipe = Pipeline([
    MeanVarianceScaler(),
    FastLinearRegressor()
])

# fit with numpy arrays
pipe.fit(X, Y)

# Fit with FileDataStream
df.to_csv('data.csv', index=False)
ds = FileDataStream.read_csv('data.csv', sep=',')

pipe = Pipeline([
    MeanVarianceScaler(),
    FastLinearRegressor()
])
pipe.fit(ds, 'y')
print(pipe.summary())
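As a usage note, the stream-trained pipeline can then score the same stream; predictions come back as a pandas DataFrame:

# score the FileDataStream with the fitted pipeline
predictions = pipe.predict(ds)
print(predictions.head())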
Example #19
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import AveragedPerceptronBinaryClassifier

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#   age  case education  induced  parity   ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    AveragedPerceptronBinaryClassifier(
        feature=['age', 'parity', 'spontaneous'], label='case')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel     Score
# 0               0 -0.285667
# 1               0 -1.304729
# 2               0 -2.651955
# 3               0 -2.111450
# 4               0 -0.660658
# print evaluation metrics
print(metrics)
Example #20
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.timeseries import IidChangePointDetector

# data input (as a FileDataStream)
path = get_dataset('timeseries').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#      t1    t2      t3
# 0  0.01  0.01  0.0100
# 1  0.02  0.02  0.0200
# 2  0.03  0.03  0.0200
# 3  0.03  0.03  0.0250
# 4  0.03  0.03  0.0005

# define the training pipeline
pipeline = Pipeline(
    [IidChangePointDetector(columns={'t2_cp': 't2'}, change_history_length=4)])

result = pipeline.fit_transform(data)
print(result)

#      t1     t2       t3  t2_cp.Alert  t2_cp.Raw Score  t2_cp.P-Value Score  t2_cp.Martingale Score
# 0  0.01   0.01   0.0100          0.0             0.01         5.000000e-01            1.212573e-03
# 1  0.02   0.02   0.0200          0.0             0.02         4.960106e-01            1.221347e-03
# 2  0.03   0.03   0.0200          0.0             0.03         1.139087e-02            3.672914e-02
# 3  0.03   0.03   0.0250          0.0             0.03         2.058296e-01            8.164447e-02
# 4  0.03   0.03   0.0005          0.0             0.03         2.804577e-01            1.373786e-01
# 5  0.03   0.05   0.0100          1.0             0.05         1.448886e-06            1.315014e+04
# 6  0.05   0.07   0.0500          0.0             0.07         2.616611e-03            4.941587e+04
# 7  0.07   0.09   0.0900          0.0             0.09         3.053187e-02            2.752614e+05
# 8  0.09  99.00  99.0000          0.0            99.00         1.000000e-08            1.389396e+12
# 9  1.10   0.10   0.1000          1.0             0.10         3.778296e-01            1.854344e+07
Example #21
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesRegressor
from nimbusml.feature_extraction.categorical import OneHotVectorizer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#   age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastTreesRegressor(feature=['induced', 'edu'], label='age')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#       Score
# 0  35.171112
# 1  35.171112
# 2  34.118595
# 3  34.118595
# 4  32.484325
# print evaluation metrics
print(metrics)
Example #22
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LinearSvmBinaryClassifier

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#   age  case education  induced  parity   ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    LinearSvmBinaryClassifier(feature=['age', 'parity', 'spontaneous'],
                              label='case')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#    PredictedLabel     Score  Probability
# 0               1  0.688481     0.607060
# 1               0 -2.514992     0.203312
# 2               0 -3.479344     0.129230
# 3               0 -3.016621     0.161422
# 4               0 -0.825512     0.397461
# print evaluation metrics
print(metrics)
Example #23
import pandas as pd
from nimbusml import Pipeline
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.preprocessing.schema import ColumnConcatenator

data = pd.DataFrame({'month': ['Jan', 'Feb'], 'year': ['1988', '1978']})

# not concatenated
xf = OneHotVectorizer()
features = xf.fit_transform(data)
print(features.head())
#
#   month.Feb  month.Jan  year.1978  year.1988
# 0        0.0        1.0        0.0        1.0
# 1        1.0        0.0        1.0        0.0

# input columns concatenated into vector type
pipe = Pipeline([
    ColumnConcatenator(columns={'f': ['month', 'year']}),
    OneHotVectorizer(columns=['f']),
])
features2 = pipe.fit_transform(data)
print(features2.head())
#   f.month.1978  f.month.1988  f.month.Feb  f.month.Jan  f.year.1978  \
# 0           0.0           0.0          0.0          1.0          0.0
# 1           0.0           0.0          1.0          0.0          1.0
#
#   f.year.1988  f.year.Feb  f.year.Jan month  year
# 0          1.0         0.0         0.0   Jan  1988
# 1          0.0         0.0         0.0   Feb  1978

# input columns concatenated, output_kind = "Bag"
pipe = Pipeline([
    ColumnConcatenator(columns={'f': ['month', 'year']}),
    OneHotVectorizer(columns=['f'], output_kind="Bag"),
])
features3 = pipe.fit_transform(data)
print(features3.head())
Example #24

from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import OrdinaryLeastSquaresRegressor

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    OrdinaryLeastSquaresRegressor(feature=['parity', 'edu'], label='age')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#       Score
# 0  35.188782
# 1  35.363689
# 2  35.188782
# 3  35.258743
# 4  32.818516
Example #25
def accuracy(ovr):
    pipe = Pipeline([ovr])
    pipe.fit(X_train, y_train)
    metrics, _ = pipe.test(X_train, y_train)
    return metrics
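A hypothetical call to this helper, assuming the OneVsRestClassifier wrapper used in Example #29 and the in-scope X_train/y_train fixtures:

from nimbusml.linear_model import LogisticRegressionBinaryClassifier
from nimbusml.multiclass import OneVsRestClassifier

# wrap a binary learner in OVR and report its training-set metrics
print(accuracy(OneVsRestClassifier(LogisticRegressionBinaryClassifier())))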
Example #26
def test_fail_predict_proba_multiclass_with_pipeline(self):
    check_unsupported_predict_proba(self,
                                    Pipeline([NaiveBayesClassifier()]),
                                    X_train, y_train, X_test)
Example #27
import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer, \
    OneHotVectorizer
from sklearn.model_selection import GridSearchCV

df = pd.DataFrame(
    dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'],
         workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
         y=[1, 0, 1, 1, 0, 1, 0]))
X = df.drop('y', axis=1)
y = df['y']
pipe = Pipeline([
    ('cat', OneHotVectorizer() << 'education'),
    # unnamed step, stays same in grid search
    OneHotHashVectorizer() << 'workclass',
    # this number_of_trees=0 setting will never actually be used: grid
    # search overrides it via learner__number_of_trees in param_grid below
    ('learner', FastTreesBinaryClassifier(number_of_trees=0,
                                          number_of_leaves=2))
])

param_grid = dict(cat__output_kind=['Indicator', 'Binary'],
                  learner__number_of_trees=[1, 2, 3])
grid = GridSearchCV(pipe, param_grid, cv=3, iid='warn')

grid.fit(X, y)
print(grid.best_params_)
# {'cat__output_kind': 'Indicator', 'learner__number_of_trees': 1}
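Since GridSearchCV refits the best parameter combination on the full data by default, the tuned pipeline can score directly afterwards (a usage sketch):

# the refit best pipeline is available after the search
best_pipe = grid.best_estimator_
print(best_pipe.predict(X).head())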
Example #28
def test_pass_decision_function_binary_with_pipeline(self):
    assert_almost_equal(decfun_sum(
        Pipeline([FactorizationMachineBinaryClassifier()])),
        -30.2316,
        decimal=4,
        err_msg=invalid_decision_function_output)
Example #29
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.multiclass import OneVsRestClassifier

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    OneVsRestClassifier(
        # using a binary classifier + OVR for multiclass dataset
        FastTreesBinaryClassifier(),
        # True = class probabilities will sum to 1.0
        # False = raw scores, unknown range
        use_probabilities=True,
        feature=['age', 'edu'],
        label='induced')
])

# train, predict, and evaluate
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel   Score.0   Score.1   Score.2
# 0               2  0.084504  0.302600  0.612897
# 1               0  0.620235  0.379226  0.000538
# 2               2  0.077734  0.061426  0.860840
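For contrast with the comments above, a sketch of the same pipeline with use_probabilities=False, where the per-class columns are raw, uncalibrated scores:

# same OVR setup, but emit raw scores instead of normalized probabilities
pipeline_raw = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    OneVsRestClassifier(FastTreesBinaryClassifier(),
                        use_probabilities=False,
                        feature=['age', 'edu'],
                        label='induced')
])
metrics_raw, scores_raw = pipeline_raw.fit(data).test(data, output_scores=True)
print(scores_raw.head())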
Example #30
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.timeseries import SsaSpikeDetector

# data input (as a FileDataStream)
path = get_dataset('timeseries').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#      t1    t2      t3
# 0  0.01  0.01  0.0100
# 1  0.02  0.02  0.0200
# 2  0.03  0.03  0.0200
# 3  0.03  0.03  0.0250
# 4  0.03  0.03  0.0005

# define the training pipeline
pipeline = Pipeline([
    SsaSpikeDetector(columns={'t2_spikes': 't2'},
                     pvalue_history_length=4,
                     training_window_size=8,
                     seasonal_window_size=3)
])

result = pipeline.fit_transform(data)
print(result)

#      t1     t2       t3  t2_spikes.Alert  t2_spikes.Raw Score  t2_spikes.P-Value Score
# 0  0.01   0.01   0.0100              0.0            -0.111334             5.000000e-01
# 1  0.02   0.02   0.0200              0.0            -0.076755             4.862075e-01
# 2  0.03   0.03   0.0200              0.0            -0.034871             3.856320e-03
# 3  0.03   0.03   0.0250              0.0            -0.012559             8.617091e-02
# 4  0.03   0.03   0.0005              0.0            -0.015723             2.252377e-01
# 5  0.03   0.05   0.0100              0.0            -0.001133             1.767711e-01
# 6  0.05   0.07   0.0500              0.0             0.006265             9.170460e-02
# 7  0.07   0.09   0.0900              0.0             0.002383             2.701134e-01