# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Training pipeline: 'education' is vectorized into 'edu', then 'age' is
# regressed on 'parity' and 'edu'.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    OnlineGradientDescentRegressor(feature=['parity', 'edu'], label='age'),
])

# Train, then evaluate on the same data that was used for training.
# TODO: Replace with CV
metrics, predictions = (
    pipeline
    .fit(data)
    .test(data, output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#        Score
# 0  28.103731
# 1  21.805904
# 2  28.103731
# 3  25.584600
# 4  33.743286

# print evaluation metrics
} if show_individual_predictions: r1 = OrdinaryLeastSquaresRegressor(**olsrArgs) r1.fit(train_df) result = r1.predict(test_df) print(result) r2 = OnlineGradientDescentRegressor(**ogdArgs) r2.fit(train_df) result = r2.predict(test_df) print(result) r3 = LightGbmRegressor(**lgbmArgs) r3.fit(train_df) result = r3.predict(test_df) print(result) # Perform a prediction using an ensemble # of all three of the above predictors. r1 = OrdinaryLeastSquaresRegressor(**olsrArgs) r2 = OnlineGradientDescentRegressor(**ogdArgs) r3 = LightGbmRegressor(**lgbmArgs) pipeline = Pipeline( [VotingRegressor(estimators=[r1, r2, r3], combiner='Average')]) pipeline.fit(train_df) result = pipeline.predict(test_df) print(result)
sep=',', numeric_dtype=numpy.float32, names={ 0: 'row_num', 5: 'case' }) print(data.head()) # age case education induced parity pooled.stratum row_num ... # 0 26.0 1.0 0-5yrs 1.0 6.0 3.0 1.0 ... # 1 42.0 1.0 0-5yrs 1.0 1.0 1.0 2.0 ... # 2 39.0 1.0 0-5yrs 2.0 6.0 4.0 3.0 ... # 3 34.0 1.0 0-5yrs 2.0 4.0 2.0 4.0 ... # 4 35.0 1.0 6-11yrs 1.0 3.0 32.0 5.0 ... # define the training pipeline pipeline = Pipeline([ OneHotVectorizer(columns={'edu': 'education'}), FastForestBinaryClassifier(feature=['age', 'edu', 'induced'], label='case') ]) # train, predict, and evaluate # TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions print(predictions.head()) # PredictedLabel Score # 0 0.0 -26.985743 # 1 0.0 -26.562090 # 2 0.0 -24.832508 # 3 0.0 -23.799389 # 4 0.0 -19.612534 # print evaluation metrics
def test_example_success(self):
    """Exercise MutualInformationSelector through several configurations.

    Builds a small frame with a label column ``like`` and two feature
    columns ``x1`` / ``x2``, then checks that the selector can be fitted
    (a) with default arguments on X/y, (b) with ``slots_in_output=2``,
    (c) with roles assigned through the ``<<`` operator, and (d) with
    ``columns=`` / ``label=`` keyword arguments. For the role-based
    variants the resolved ``input`` / ``output`` / ``label_column``
    attributes are asserted as well.
    """
    like = [
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True
    ]
    x1 = [(5. if liked else 4.) for liked in like]
    x2 = [(-5. if liked else -4.) for liked in like]
    # A few outliers so the two features are not perfect label copies.
    x1[0] = 50
    x2[1] = 50
    x2[2] = 50
    # Each column gets its own list; previously x2 was passed for both
    # columns, leaving x1 unused and the two features identical.
    train_data = pandas.DataFrame(
        data=dict(like=like, x1=x1, x2=x2),
        dtype=numpy.float32)

    X = train_data.drop('like', axis=1)
    y = train_data[['like']]

    # Default selector, features and label passed separately.
    transform_2 = MutualInformationSelector()
    exp = Pipeline([transform_2])
    res = exp.fit_transform(X, y)
    assert res is not None

    # Explicit number of output slots.
    transform_2 = MutualInformationSelector(slots_in_output=2)
    pipe = Pipeline([transform_2])
    res = pipe.fit_transform(X, y)
    assert res is not None

    # Roles given with '<<': Feature role maps both columns to 'Feature'.
    transform_2 = MutualInformationSelector() << {
        Role.Feature: ['x1', 'x2'],
        Role.Label: 'like'
    }
    assert transform_2._allowed_roles == {'Label'}
    assert transform_2.label_column == 'like'
    assert transform_2.input == ['x1', 'x2']
    assert transform_2.output == ['Feature']
    exp = Pipeline([transform_2])
    res = exp.fit_transform(train_data)
    assert res is not None

    # Custom output name 'zoo' for two input columns.
    transform_2 = MutualInformationSelector() << {
        "zoo": ['x1', 'x2'],
        Role.Label: 'like'
    }
    assert transform_2._allowed_roles == {'Label'}
    assert transform_2.label_column == 'like'
    assert transform_2.input == ['x1', 'x2']
    assert transform_2.output == ['zoo']
    exp = Pipeline([transform_2])
    res = exp.fit_transform(train_data)
    assert res is not None

    # Custom output name 'zoo' for a single input column.
    transform_2 = MutualInformationSelector() << {
        "zoo": ['x1'],
        Role.Label: 'like'
    }
    assert transform_2._allowed_roles == {'Label'}
    assert transform_2.label_column == 'like'
    assert transform_2.input == ['x1']
    assert transform_2.output == ['zoo']
    exp = Pipeline([transform_2])
    res = exp.fit_transform(train_data)
    assert res is not None

    # Keyword-argument form: columns= and label= instead of '<<'.
    transform_2 = MutualInformationSelector(slots_in_output=1,
                                            columns=['x1'],
                                            label='like')
    assert transform_2._allowed_roles == {'Label'}
    assert transform_2.label_column == 'like'
    assert transform_2.input == ['x1']
    assert transform_2.output == ['x1']
    pipe = Pipeline([transform_2])
    pipe.fit(train_data)
    res = pipe.transform(train_data)
    assert res is not None
from nimbusml import Pipeline
from nimbusml.datasets.image import get_RevolutionAnalyticslogo, get_Microsoftlogo
from nimbusml.feature_extraction.image import Loader, Resizer, PixelExtractor
from nimbusml.linear_model import FastLinearBinaryClassifier

# Two-row frame: one image path per row together with its boolean label.
data = pandas.DataFrame(
    data=dict(
        Path=[get_RevolutionAnalyticslogo(), get_Microsoftlogo()],
        Label=[True, False],
    )
)
X = data[['Path']]
y = data[['Label']]

# Training pipeline: Path -> ImgPath -> ImgResize (32x32) -> ImgPixels,
# with the classifier reading the 'ImgPixels' column.
pipeline = Pipeline([
    Loader(columns={'ImgPath': 'Path'}),
    Resizer(image_width=32, image_height=32,
            columns={'ImgResize': 'ImgPath'}),
    PixelExtractor(columns={'ImgPixels': 'ImgResize'}),
    FastLinearBinaryClassifier(feature='ImgPixels'),
])

# train
pipeline.fit(X, y)

# predict on the training rows
scores = pipeline.predict(X)
print("Predicted Labels:", scores.PredictedLabel.values)
# Predicted Labels : [True False]
print("Accuracy:",
      np.mean(y.Label.values == scores.PredictedLabel.values))
# Accuracy : 1
# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Training pipeline: 'education' is vectorized into 'edu'; the classifier
# uses 'induced' and 'edu' as features, 'case' as label, and a Goss booster.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LightGbmBinaryClassifier(
        feature=['induced', 'edu'],
        label='case',
        booster=Goss(top_rate=0.9),
    ),
])

# Train, then evaluate on the same data that was used for training.
metrics, predictions = (
    pipeline
    .fit(data, 'case')
    .test(data, output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#    PredictedLabel  Probability     Score
# 0               1     0.612220  0.913309
# 1               1     0.612220  0.913309
# 2               0     0.334486 -1.375929
# 3               0     0.334486 -1.375929
# 4               0     0.421264 -0.635176
# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Training pipeline: 'education' is vectorized into 'edu', then five
# clusters are requested over 'induced', 'edu' and 'parity'.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    KMeansPlusPlus(n_clusters=5, feature=['induced', 'edu', 'parity']),
])

# Train, then evaluate on the same data; 'induced' is handed to test().
metrics, predictions = (
    pipeline
    .fit(data)
    .test(data, 'induced', output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#    PredictedLabel   Score.0   Score.1   Score.2   Score.3   Score.4
# 0               4  2.732253  2.667988  2.353899  2.339244  0.092014
# 1               4  2.269290  2.120064  2.102576  2.222578  0.300347
# 2               4  3.482253  3.253153  2.425328  2.269245  0.258680
# 3               4  3.130401  2.867317  2.158132  2.055911  0.175347
# 4               2  0.287809  2.172567  0.036439  2.102578  2.050347
def test_pipeline_clone_dataframe_transforms(self):
    """Run the fit/transform/clone check on a one-hot pipeline with `df`."""
    onehot_step = OneHotVectorizer(columns={'onehot': 'group_2'})
    fit_transform_clone_and_check(Pipeline([onehot_step]), df)
def test_pipeline_clone_filedatastream_transforms(self):
    """Run the fit/transform/clone check on a one-hot pipeline with `fds`."""
    onehot_step = OneHotVectorizer(columns={'onehot': 'group_2'})
    fit_transform_clone_and_check(Pipeline([onehot_step]), fds)
# Load the 'timeseries' dataset as a FileDataStream.
path = get_dataset('timeseries').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#      t1    t2      t3
# 0  0.01  0.01  0.0100
# 1  0.02  0.02  0.0200
# 2  0.03  0.03  0.0200
# 3  0.03  0.03  0.0250
# 4  0.03  0.03  0.0005

# Single-step pipeline: the detector reads 't2' and writes 't2_cp'.
pipeline = Pipeline([
    SsaChangePointDetector(
        columns={'t2_cp': 't2'},
        change_history_length=4,
        training_window_size=8,
        seasonal_window_size=3,
    ),
])

result = pipeline.fit_transform(data)
print(result)
#      t1    t2      t3  t2_cp.Alert  t2_cp.Raw Score  t2_cp.P-Value Score  t2_cp.Martingale Score
# 0  0.01  0.01  0.0100          0.0        -0.111334         5.000000e-01                0.001213
# 1  0.02  0.02  0.0200          0.0        -0.076755         4.862075e-01                0.001243
# 2  0.03  0.03  0.0200          0.0        -0.034871         3.856320e-03                0.099119
# 3  0.03  0.03  0.0250          0.0        -0.012559         8.617091e-02                0.482400
# 4  0.03  0.03  0.0005          0.0        -0.015723         2.252377e-01                0.988788
# 5  0.03  0.05  0.0100          0.0        -0.001133         1.767711e-01                2.457946
# 6  0.05  0.07  0.0500          0.0         0.006265         9.170460e-02                0.141898
# 7  0.07  0.09  0.0900          0.0         0.002383         2.701134e-01                0.050747
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# use 'wiki_detox_train' data set to create test and train data
# Sentiment   SentimentText
# 1           ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1           == OK! ==  IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
(train, label) = get_X_y(train_file, label_column='Sentiment', sep='\t')

X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space
texttransform = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None',
) << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

# Featurizer followed by the classifier, fitted on the training split.
ppl = Pipeline([texttransform, nb])
ppl.fit(X_train, y_train)

# evaluate the model on the held-out split
metrics, scores = ppl.test(X_test, y_test, output_scores=True)
print(metrics)
# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Training pipeline: 'education' is vectorized into 'edu'; the classifier
# uses 'parity' and 'edu' as features and 'case' as label.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    SgdBinaryClassifier(feature=['parity', 'edu'], label='case'),
])

# Train, then evaluate on the same data that was used for training.
metrics, predictions = (
    pipeline
    .fit(data)
    .test(data, output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#    PredictedLabel  Probability     Score
# 0               0     0.363427 -0.560521
# 1               0     0.378848 -0.494439
# 2               0     0.363427 -0.560521
# 3               0     0.369564 -0.534088
# 4               0     0.336350 -0.679603

# print evaluation metrics
print(metrics)
# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Training pipeline: 'education' is vectorized into 'edu'; the classifier
# uses 'parity' and 'edu' as features, 'induced' as label, and a Dart booster.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LightGbmClassifier(
        feature=['parity', 'edu'],
        label='induced',
        booster=Dart(reg_lambda=0.1),
    ),
])

# Train, then evaluate on the same data that was used for training.
metrics, predictions = (
    pipeline
    .fit(data)
    .test(data, output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#    PredictedLabel   Score.0   Score.1   Score.2
# 0               2  0.070722  0.145439  0.783839
# 1               0  0.737733  0.260116  0.002150
# 2               2  0.070722  0.145439  0.783839
# 3               0  0.490715  0.091749  0.417537
# 4               0  0.562419  0.197818  0.239763

# print evaluation metrics
# LightGbmRanker
import numpy as np
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker

# Load the 'gen_tickettrain' dataset as a FileDataStream.
path = get_dataset('gen_tickettrain').as_filepath()

# LightGbmRanker requires key type for group column
data = FileDataStream.read_csv(path, dtype={'group': np.uint32})

# Single-step pipeline: rank on 'Class', 'dep_day' and 'duration', with
# 'rank' as label and 'group' as the group id.
pipeline = Pipeline([
    LightGbmRanker(
        feature=['Class', 'dep_day', 'duration'],
        label='rank',
        group_id='group',
    ),
])

# Train, then evaluate on the same data that was used for training.
metrics, predictions = (
    pipeline
    .fit(data)
    .test(data, output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#       Score
# 0 -0.124121
# 1 -0.124121
# 2 -0.124121
# 3 -0.376062
def test_pass_decision_function_multiclass_with_pipeline(self):
    """Check decfun_sum of a NaiveBayesClassifier pipeline to 4 decimals."""
    pipeline = Pipeline([NaiveBayesClassifier()])
    assert_almost_equal(
        decfun_sum(pipeline),
        -96.87325,
        decimal=4,
        err_msg=invalid_decision_function_output,
    )
def test_pipeline_clone_filedatastream_transforms_shift_operator(self):
    """Same clone check on `fds`, with columns assigned through `<<`."""
    onehot_step = OneHotVectorizer() << {'onehot': 'group_2'}
    fit_transform_clone_and_check(Pipeline([onehot_step]), fds)
def test_fail_decision_function_multiclass_with_pipeline(self):
    """Pass a LogisticRegressionClassifier pipeline to the unsupported-decision-function check."""
    pipeline = Pipeline([LogisticRegressionClassifier()])
    check_unsupported_decision_function(
        self, pipeline, X_train, y_train, X_test)
###############################################################################
# Pipeline
import numpy as np
import pandas as pd
from nimbusml import Pipeline, FileDataStream
from nimbusml.linear_model import FastLinearRegressor
from nimbusml.preprocessing.normalization import MeanVarianceScaler

# Three samples with two features each, plus the regression targets.
X = np.array([[1, 2.0], [2, 4], [3, 0.7]])
Y = np.array([2, 3, 1.5])

# The same data as a frame with columns y, x1, x2 (written to CSV below).
df = pd.DataFrame(dict(y=Y, x1=X[:, 0], x2=X[:, 1]))

pipe = Pipeline([
    MeanVarianceScaler(),
    FastLinearRegressor(),
])

# Fit on the in-memory numpy arrays X and Y.
pipe.fit(X, Y)

# Fit with FileDataStream: dump the frame to CSV and stream it back in.
df.to_csv('data.csv', index=False)
ds = FileDataStream.read_csv('data.csv', sep=',')

pipe = Pipeline([
    MeanVarianceScaler(),
    FastLinearRegressor(),
])
# 'y' names the label column of the stream.
pipe.fit(ds, 'y')
print(pipe.summary())
from nimbusml.linear_model import AveragedPerceptronBinaryClassifier

# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Single-step pipeline: classify 'case' from 'age', 'parity', 'spontaneous'.
pipeline = Pipeline([
    AveragedPerceptronBinaryClassifier(
        feature=['age', 'parity', 'spontaneous'],
        label='case',
    ),
])

# Train, then evaluate on the same data that was used for training.
# TODO: Replace with CV
metrics, predictions = (
    pipeline
    .fit(data)
    .test(data, output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#    PredictedLabel     Score
# 0               0 -0.285667
# 1               0 -1.304729
# 2               0 -2.651955
# 3               0 -2.111450
# 4               0 -0.660658

# print evaluation metrics
from nimbusml.timeseries import IidChangePointDetector

# Load the 'timeseries' dataset as a FileDataStream.
path = get_dataset('timeseries').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#      t1    t2      t3
# 0  0.01  0.01  0.0100
# 1  0.02  0.02  0.0200
# 2  0.03  0.03  0.0200
# 3  0.03  0.03  0.0250
# 4  0.03  0.03  0.0005

# Single-step pipeline: the detector reads 't2' and writes 't2_cp'.
pipeline = Pipeline([
    IidChangePointDetector(
        columns={'t2_cp': 't2'},
        change_history_length=4,
    ),
])

result = pipeline.fit_transform(data)
print(result)
#      t1     t2       t3  t2_cp.Alert  t2_cp.Raw Score  t2_cp.P-Value Score  t2_cp.Martingale Score
# 0  0.01   0.01   0.0100          0.0             0.01         5.000000e-01            1.212573e-03
# 1  0.02   0.02   0.0200          0.0             0.02         4.960106e-01            1.221347e-03
# 2  0.03   0.03   0.0200          0.0             0.03         1.139087e-02            3.672914e-02
# 3  0.03   0.03   0.0250          0.0             0.03         2.058296e-01            8.164447e-02
# 4  0.03   0.03   0.0005          0.0             0.03         2.804577e-01            1.373786e-01
# 5  0.03   0.05   0.0100          1.0             0.05         1.448886e-06            1.315014e+04
# 6  0.05   0.07   0.0500          0.0             0.07         2.616611e-03            4.941587e+04
# 7  0.07   0.09   0.0900          0.0             0.09         3.053187e-02            2.752614e+05
# 8  0.09  99.00  99.0000          0.0            99.00         1.000000e-08            1.389396e+12
# 9  1.10   0.10   0.1000          1.0             0.10         3.778296e-01            1.854344e+07
from nimbusml.feature_extraction.categorical import OneHotVectorizer

# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Training pipeline: 'education' is vectorized into 'edu', then 'age' is
# regressed on 'induced' and 'edu'.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastTreesRegressor(feature=['induced', 'edu'], label='age'),
])

# Train, then evaluate on the same data that was used for training.
# TODO: Replace with CV
metrics, predictions = (
    pipeline
    .fit(data)
    .test(data, output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#        Score
# 0  35.171112
# 1  35.171112
# 2  34.118595
# 3  34.118595
# 4  32.484325

# print evaluation metrics
from nimbusml.linear_model import LinearSvmBinaryClassifier

# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Single-step pipeline: classify 'case' from 'age', 'parity', 'spontaneous'.
pipeline = Pipeline([
    LinearSvmBinaryClassifier(
        feature=['age', 'parity', 'spontaneous'],
        label='case',
    ),
])

# Train, then evaluate on the same data that was used for training.
# TODO: Replace with CV
metrics, predictions = (
    pipeline
    .fit(data)
    .test(data, output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#    PredictedLabel     Score  Probability
# 0               1  0.688481     0.607060
# 1               0 -2.514992     0.203312
# 2               0 -3.479344     0.129230
# 3               0 -3.016621     0.161422
# 4               0 -0.825512     0.397461

# print evaluation metrics
from nimbusml.preprocessing.schema import ColumnConcatenator data = pd.DataFrame({'month': ['Jan', 'Feb'], 'year': ['1988', '1978']}) # not concatenated xf = OneHotVectorizer() features = xf.fit_transform(data) print(features.head()) # # month.Feb month.Jan year.1978 year.1988 # 0 0.0 1.0 0.0 1.0 # 1 1.0 0.0 1.0 0.0 # input columns concatenated into vector type pipe = Pipeline([ ColumnConcatenator(columns={'f': ['month', 'year']}), OneHotVectorizer(columns=['f']), ]) features2 = pipe.fit_transform(data) print(features2.head()) # f.month.1978 f.month.1988 f.month.Feb f.month.Jan f.year.1978 \ # 0 0.0 0.0 0.0 1.0 0.0 # 1 0.0 0.0 1.0 0.0 1.0 # # f.year.1988 f.year.Feb f.year.Jan month year # 0 1.0 0.0 0.0 Jan 1988 # 1 0.0 0.0 0.0 Feb 1978 # input columns concatenated, output_kind = "Bag" pipe = Pipeline([ ColumnConcatenator(columns={'f': ['month', 'year']}), OneHotVectorizer(columns=['f'], output_kind="Bag"),
# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Training pipeline: 'education' is vectorized into 'edu', then 'age' is
# regressed on 'parity' and 'edu'.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    OrdinaryLeastSquaresRegressor(feature=['parity', 'edu'], label='age'),
])

# Train, then evaluate on the same data that was used for training.
# TODO: Replace with CV
metrics, predictions = (
    pipeline
    .fit(data)
    .test(data, output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#        Score
# 0  35.188782
# 1  35.363689
# 2  35.188782
# 3  35.258743
# 4  32.818516
def accuracy(ovr):
    """Fit a one-step pipeline around `ovr` and return its test metrics.

    The pipeline is fitted and tested on the module-level
    ``X_train`` / ``y_train``; the second element returned by
    ``Pipeline.test`` is discarded.
    """
    pipe = Pipeline([ovr])
    pipe.fit(X_train, y_train)
    metrics, _discarded = pipe.test(X_train, y_train)
    return metrics
def test_fail_predict_proba_multiclass_with_pipeline(self):
    """Pass a NaiveBayesClassifier pipeline to the unsupported-predict_proba check."""
    pipeline = Pipeline([NaiveBayesClassifier()])
    check_unsupported_predict_proba(
        self, pipeline, X_train, y_train, X_test)
import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer, \
    OneHotVectorizer
from sklearn.model_selection import GridSearchCV

# Small in-memory dataset: two categorical features and a binary target.
df = pd.DataFrame(
    dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'],
         workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
         y=[1, 0, 1, 1, 0, 1, 0]))
X = df.drop('y', axis=1)
y = df['y']

pipe = Pipeline([
    # named step, addressed as 'cat__...' in the parameter grid
    ('cat', OneHotVectorizer() << 'education'),
    # unnamed step, stays same in grid search
    OneHotHashVectorizer() << 'workclass',
    # this instance of FastTreesBinaryClassifier with number_of_trees 0 will
    # never be run by grid search, as number_of_trees is always overridden by
    # param_grid below
    ('learner', FastTreesBinaryClassifier(number_of_trees=0,
                                          number_of_leaves=2))
])

# Grid over the named steps' parameters ('<step name>__<parameter>').
param_grid = dict(cat__output_kind=['Indicator', 'Binary'],
                  learner__number_of_trees=[1, 2, 3])

# No `iid` argument: the original passed iid='warn', which was merely the
# old default and is rejected by scikit-learn releases that removed `iid`.
grid = GridSearchCV(pipe, param_grid, cv=3)

grid.fit(X, y)
print(grid.best_params_)
# {'cat__output_kind': 'Indicator', 'learner__number_of_trees': 1}
def test_pass_decision_function_binary_with_pipeline(self):
    """Check decfun_sum of a FactorizationMachineBinaryClassifier pipeline to 4 decimals."""
    pipeline = Pipeline([FactorizationMachineBinaryClassifier()])
    assert_almost_equal(
        decfun_sum(pipeline),
        -30.2316,
        decimal=4,
        err_msg=invalid_decision_function_output,
    )
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Training pipeline: 'education' is vectorized into 'edu', then a
# one-vs-rest wrapper predicts 'induced' from 'age' and 'edu'.
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    OneVsRestClassifier(
        # using a binary classifier + OVR for multiclass dataset
        FastTreesBinaryClassifier(),
        # True = class probabilities will sum to 1.0
        # False = raw scores, unknown range
        use_probabilities=True,
        feature=['age', 'edu'],
        label='induced',
    ),
])

# Train, then evaluate on the same data that was used for training.
metrics, predictions = (
    pipeline
    .fit(data)
    .test(data, output_scores=True)
)

# First rows of the per-row predictions.
print(predictions.head())
#    PredictedLabel   Score.0   Score.1   Score.2
# 0               2  0.084504  0.302600  0.612897
# 1               0  0.620235  0.379226  0.000538
# 2               2  0.077734  0.061426  0.860840
# Load the 'timeseries' dataset as a FileDataStream.
path = get_dataset('timeseries').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#      t1    t2      t3
# 0  0.01  0.01  0.0100
# 1  0.02  0.02  0.0200
# 2  0.03  0.03  0.0200
# 3  0.03  0.03  0.0250
# 4  0.03  0.03  0.0005

# Single-step pipeline: the detector reads 't2' and writes 't2_spikes'.
pipeline = Pipeline([
    SsaSpikeDetector(
        columns={'t2_spikes': 't2'},
        pvalue_history_length=4,
        training_window_size=8,
        seasonal_window_size=3,
    ),
])

result = pipeline.fit_transform(data)
print(result)
#      t1    t2      t3  t2_spikes.Alert  t2_spikes.Raw Score  t2_spikes.P-Value Score
# 0  0.01  0.01  0.0100              0.0            -0.111334             5.000000e-01
# 1  0.02  0.02  0.0200              0.0            -0.076755             4.862075e-01
# 2  0.03  0.03  0.0200              0.0            -0.034871             3.856320e-03
# 3  0.03  0.03  0.0250              0.0            -0.012559             8.617091e-02
# 4  0.03  0.03  0.0005              0.0            -0.015723             2.252377e-01
# 5  0.03  0.05  0.0100              0.0            -0.001133             1.767711e-01
# 6  0.05  0.07  0.0500              0.0             0.006265             9.170460e-02
# 7  0.07  0.09  0.0900              0.0             0.002383             2.701134e-01