def test_lightgbmranker_asfilestream(self):
    # Data file
    file_path = get_dataset("gen_tickettrain").as_filepath()

    # Pure-nimbusml paradigm
    train_stream = FileDataStream.read_csv(file_path, encoding='utf-8')

    # pipeline
    pipeline = Pipeline([
        # the group_id column must be of key type
        ToKey(columns={'rank': 'rank', 'group': 'group'}),
        LightGbmRanker(
            feature=['Class', 'dep_day', 'duration'],
            label='rank',
            group_id='group')
    ])

    # train
    pipeline.fit(train_stream)

    # test
    eval_stream = FileDataStream.read_csv(file_path)
    metrics, _ = pipeline.test(eval_stream)
    assert_almost_equal(
        metrics['NDCG@1'][0],
        0.43571429,
        decimal=7,
        err_msg="NDCG@1 should be %s" % 0.43571429)
    assert_almost_equal(
        metrics['NDCG@2'][0],
        0.5128226,
        decimal=7,
        err_msg="NDCG@2 should be %s" % 0.5128226)
    assert_almost_equal(
        metrics['NDCG@3'][0],
        0.55168069,
        decimal=7,
        err_msg="NDCG@3 should be %s" % 0.55168069)
    assert_almost_equal(
        metrics['DCG@1'][0],
        4.688759,
        decimal=3,
        err_msg="DCG@1 should be %s" % 4.688759)
    assert_almost_equal(
        metrics['DCG@2'][0],
        9.012395,
        decimal=3,
        err_msg="DCG@2 should be %s" % 9.012395)
    assert_almost_equal(
        metrics['DCG@3'][0],
        11.446943,
        decimal=3,
        err_msg="DCG@3 should be %s" % 11.446943)
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
import unittest

import pandas
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer

path = get_dataset("wiki_detox_train").as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
df = data.to_df().head()
X = df['SentimentText']


class TestPipelineTransformMethod(unittest.TestCase):

    def test_transform_only_pipeline_transform_method(self):
        p = Pipeline(
            [NGramFeaturizer(char_feature_extractor=None) << 'SentimentText'])
        p.fit(X)
        xf = p.transform(X)
        assert 'SentimentText.==rude==' in xf.columns


if __name__ == '__main__':
    unittest.main()
###############################################################################
# CV - cross-validate data
import numpy as np
from nimbusml import Pipeline, FileDataStream, DataSchema
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionClassifier, \
    FastLinearRegressor
from nimbusml.model_selection import CV
from nimbusml.preprocessing.missing_values import Indicator, Handler

# Case 1: Default usage of CV
path = get_dataset('infert').as_filepath()
schema = DataSchema.read_schema(path, numeric_dtype=np.float32)
data = FileDataStream.read_csv(path, schema=schema)

pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LogisticRegressionClassifier(feature=['age', 'spontaneous', 'edu'],
                                 label='induced')
])

# Do 3-fold cross-validation
cv_results = CV(pipeline).fit(data, cv=3)

# print summary statistic of metrics
print(cv_results['metrics_summary'])

# print metrics for all folds
print(cv_results['metrics'])
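# Case 2 (a hedged sketch, not part of the original excerpt): the unused
# imports above (Indicator, Handler, FastLinearRegressor) suggest a
# follow-up case that cross-validates a regression pipeline with
# missing-value handling; the column choices below are illustrative.
reg_pipeline = Pipeline([
    Indicator(columns={'age_ind': 'age'}),
    Handler(columns={'age': 'age'}, replace_with='Mean'),
    FastLinearRegressor(feature=['age', 'age_ind', 'spontaneous'],
                        label='parity')
])
reg_results = CV(reg_pipeline).fit(data, cv=3)
print(reg_results['metrics_summary'])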
###############################################################################
# LightLda
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer, LightLda
from nimbusml.feature_extraction.text.extractor import Ngram

# data input as a FileDataStream
path = get_dataset('topics').as_filepath()
data = FileDataStream.read_csv(path, sep=",")
print(data.head())
#                                review                   review_reverse  label
# 0  animals birds cats dogs fish horse   radiation galaxy universe duck      1
# 1    horse birds house fish duck cats  space galaxy universe radiation      0
# 2         car truck driver bus pickup                       bus pickup      1

# transform usage
pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    vector_normalizer='None',
                    columns=['review']),
    LightLda(num_topic=3, columns=['review'])
])

# fit and transform
features = pipeline.fit_transform(data)
print(features.head())
#    label  review.0  review.1  review.2                   review_reverse
# 0      1  0.500000  0.333333  0.166667   radiation galaxy universe duck
# 1      0  0.000000  0.166667  0.833333  space galaxy universe radiation
# 2      1  0.400000  0.200000  0.400000                       bus pickup
###############################################################################
# NGramFeaturizer
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.feature_extraction.text.stopwords import CustomStopWordsRemover

# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#    Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

xf = NGramFeaturizer(word_feature_extractor=Ngram(),
                     stop_words_remover=CustomStopWordsRemover(
                         ['!', '$', '%', '&', '\'', '\'d']),
                     columns={'features': ['SentimentText']})

# fit and transform
features = xf.fit_transform(data)
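# print the transformed features (the output columns depend on the
# learned n-gram vocabulary, so no values are shown here)
print(features.head())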
###############################################################################
# ColumnSelector
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnSelector

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',')

# transform usage
xf = ColumnSelector(columns=['education', 'age'])

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#    age education
# 0   26    0-5yrs
# 1   42    0-5yrs
# 2   39    0-5yrs
# 3   34    0-5yrs
# 4   35   6-11yrs
###############################################################################
# SsaForecaster
import pandas as pd
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.timeseries import SsaForecaster

# data input (as a FileDataStream)
path = get_dataset('timeseries').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#      t1    t2      t3
# 0  0.01  0.01  0.0100
# 1  0.02  0.02  0.0200
# 2  0.03  0.03  0.0200
# 3  0.03  0.03  0.0250
# 4  0.03  0.03  0.0005

# define the training pipeline
pipeline = Pipeline([
    SsaForecaster(series_length=6,
                  train_size=8,
                  window_size=3,
                  horizon=2,
                  columns={'t2_fc': 't2'})
])

result = pipeline.fit_transform(data)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
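# print the transformed frame; with horizon=2, the t2_fc.0 and t2_fc.1
# slots hold the two-step-ahead forecasts for each row (values depend on
# the fitted SSA model, so none are shown here)
print(result)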
import unittest

import numpy as np
import pandas as pd
import six
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer, \
    OneHotVectorizer
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text import WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.linear_model import FastLinearBinaryClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import GridSearchCV
from sklearn.utils.testing import assert_raises

train_file = get_dataset('uciadult_train').as_filepath()
test_file = get_dataset('uciadult_test').as_filepath()
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'ethnicity', 'sex', 'native-country-region'
]
label_column = 'label'


class TestSweep(unittest.TestCase):

    def test_hyperparameters_sweep(self):
        # general test with combination of named and unnamed steps
        np.random.seed(0)
        df = pd.DataFrame(
            dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'],
                 workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
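                 # hypothetical label column added only to close this
                 # truncated excerpt; the original test continues with
                 # more setup and a GridSearchCV sweep over the pipeline
                 y=[1, 0, 1, 1, 0, 1, 0]))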
# --------------------------------------------------------------------------------------------
import unittest

import numpy as np
import pandas as pd
from nimbusml import FileDataStream
from nimbusml import Pipeline, Role
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import FastLinearRegressor
from nimbusml.preprocessing import ToKey
from nimbusml.utils.exports import dot_export_pipeline

# set up dataframe
path = get_dataset("gen_tickettrain").as_filepath()
df = pd.read_csv(path)
df = df.rename(index=str, columns={'rank': 'label_1', 'group': 'group_2'})
features = ['price', 'Class', 'duration']
df['group_2'] = df['group_2'].astype(np.uint32)
X = df.drop(['label_1'], axis=1)
y = df['label_1']

# set up filedatastream
fds = FileDataStream.read_csv(path, names={0: 'label_1', 1: 'group_2'})


def clone_and_check(pipe):
    pipe_attrs = pipe.__dict__.copy()
    cloned_pipe = pipe.clone()
    cloned_attrs = cloned_pipe.__dict__.copy()
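    # a hedged completion of this truncated helper: the natural check is
    # that cloning preserves the pipeline's configuration (the original
    # file's exact assertions may differ)
    assert set(pipe_attrs.keys()) == set(cloned_attrs.keys())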
def test_syntax_slots_wo_pipeline(self):
    # data
    df = get_dataset("infert").as_df()
    df = df.drop(['row_num', ], axis=1)
    X = df.drop('case', axis=1)
    y = df['case']

    # transform
    xf1 = OneHotVectorizer(columns=['age', 'parity', 'education_str'])
    X_xf1 = xf1.fit_transform(X, verbose=0)
    assert "age.21" in list(X_xf1.columns)

    # learner
    # (1.a)
    model = AveragedPerceptronBinaryClassifier()

    # (1.b)
    try:
        model = AveragedPerceptronBinaryClassifier(feature=['age'])
        model.fit(X_xf1, y, verbose=0)
        cont = True
        assert False
    except Exception as e:
        # does not work
        cont = False
        print(e)

    if cont:
        y_pred = model.predict(X_xf1)
        assert y_pred.shape == (248, 3)

    pipeline = Pipeline([
        OneHotVectorizer(columns=['age', 'parity', 'education_str']),
        AveragedPerceptronBinaryClassifier(feature='age')
    ])
    pipeline.fit(X, y, verbose=0)

    y_pred_withpipeline = pipeline.predict(X)
    print(y_pred_withpipeline.head())
    assert y_pred_withpipeline.shape == (248, 3)

    metrics, scores = pipeline.test(X, y, output_scores=True)
    print(metrics)
    assert scores.shape == (248, 3)
    assert metrics.shape == (1, 11)

    # back to X_xf1
    print(list(X_xf1.columns))
    l1 = list(sorted(set(_.split('.')[-1] for _ in X_xf1.columns)))
    levels = [['age', 'education', 'education_str', 'parity', 'pooled',
               'spontaneous', 'stratum', 'induced'],
              [''] + l1]
    names = ['columns', 'slots']
    labels = [[], []]
    ages = []
    for _ in X_xf1.columns:
        spl = _.split('.')
        l1 = levels[0].index(spl[0])
        try:
            l2 = levels[1].index(spl[1])
        except IndexError:
            l2 = levels[1].index('')
        labels[0].append(l1)
        labels[1].append(l2)
        if spl[0] == 'age':
            ages.append(l2)
    X_xf1.columns = pandas.MultiIndex(
        levels=levels, labels=labels, names=names)
    print(X_xf1.head(n=2).T)

    col_ages = [('age', a) for a in ages]
    print(col_ages)

    try:
        model = AveragedPerceptronBinaryClassifier(feature=col_ages)
        model.fit(X_xf1, y, verbose=0)
        y_pred = model.predict(X_xf1)
        assert y_pred.shape == (248, 3)
    except Exception as e:
        # Does not work, probably confusion between list and tuple in
        # nimbusml
        print(e)

    try:
        model = AveragedPerceptronBinaryClassifier(feature=['age'])
        model.fit(X_xf1, y, verbose=0)
        y_pred = model.predict(X_xf1)
        assert y_pred.shape == (248, 3)
    except Exception as e:
        # Does not work.
        print(e)
def setUpClass(cls):
    df = get_dataset("infert").as_df()
    # remove : and ' ' from column names, and encode categorical column
    df.columns = [i.replace(': ', '') for i in df.columns]
    df = (OneHotVectorizer() << ['education_str']).fit_transform(df)
    cls.X, cls.y = split_features_and_label(df, 'case')
def setUpClass(cls):
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    cls.X, cls.y = split_features_and_label(df, 'Label')
import pickle

import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesBinaryClassifier
from sklearn.model_selection import train_test_split

# Use the built-in data set 'infert' to create test and train data
#    Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#    pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs
np.random.seed(0)

df = get_dataset("infert").as_df()[['stratum', 'parity', 'case']]
# remove : and ' ' from column names
df.columns = [i.replace(': ', '') for i in df.columns]

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'case'], df['case'])

# train model and see accuracy
ftree = FastTreesBinaryClassifier().fit(X_train, y_train)
scores = ftree.predict(X_test)
print('Accuracy1:', np.mean(y_test == [i for i in scores]))

# Unpickle model and score. We should get the exact same accuracy as above
s = pickle.dumps(ftree)
ftree2 = pickle.loads(s)
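# score with the unpickled model; per the comment above, this should
# reproduce Accuracy1 exactly
scores2 = ftree2.predict(X_test)
print('Accuracy2:', np.mean(y_test == [i for i in scores2]))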
###############################################################################
# OrdinaryLeastSquaresRegressor
from nimbusml import Pipeline, FileDataStream, Role
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import OrdinaryLeastSquaresRegressor
from nimbusml.preprocessing.missing_values import Filter

# use the built-in data set 'airquality' to create test and train data
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2
train_file = get_dataset("airquality").as_filepath()
schema = "col=none:R4:0 col=ozone:R4:1 col=solar:R4:2 col=wind:R4:3 " \
         "col=temp:R4:4 col=month:R4:5 col=day:R4:6 sep=, header=+"
fds = FileDataStream(train_file, schema=schema)

# set up pipeline
pipe = Pipeline([
    Filter() << ['ozone'],
    OrdinaryLeastSquaresRegressor() << {
        Role.Label: 'ozone',
        Role.Feature: ['solar', 'wind', 'temp', 'month', 'day']
    }
])

# train and evaluate the model
metrics, scores = pipe.fit(fds).test(fds, "ozone", output_scores=True)
print(metrics)
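# the per-row predictions are also available (values depend on the
# trained model, so none are shown here)
print(scores.head())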
# FastForestBinaryClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastForestBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from sklearn.model_selection import train_test_split

# use the built-in data set 'infert' to create test and train data
#    Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#    pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs
np.random.seed(0)

df = get_dataset("infert").as_df()
# remove : and ' ' from column names, and encode categorical column
df.columns = [i.replace(': ', '') for i in df.columns]
df = (OneHotVectorizer() << 'education_str').fit_transform(df)

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'case'], df['case'])

fforest = FastForestBinaryClassifier().fit(X_train, y_train)
scores = fforest.predict(X_test)

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
# Pipeline with observation level feature contributions

# Scoring a dataset with a trained model produces a score, or prediction,
# for each example. To understand and explain these predictions it can be
# useful to inspect which features influenced them most significantly. This
# function computes a model-specific list of per-feature contributions to
# the score for each example. These contributions can be positive (they
# make the score higher) or negative (they make the score lower).
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.linear_model import LogisticRegressionBinaryClassifier

# data input (as a FileDataStream)
path = get_dataset('uciadult_train').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    label  workclass     education  ... capital-loss hours-per-week
# 0      0    Private          11th  ...            0             40
# 1      0    Private       HS-grad  ...            0             50
# 2      1  Local-gov    Assoc-acdm  ...            0             40
# 3      1    Private  Some-college  ...            0             40
# 4      0          ?  Some-college  ...            0             30

# define the training pipeline with a linear model
lr_pipeline = Pipeline([LogisticRegressionBinaryClassifier(
    feature=['age', 'education-num', 'hours-per-week'], label='label')])

# train the model
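# a hedged sketch of the remaining steps: nimbusml's Pipeline exposes
# get_feature_contributions() for this scenario, though the original
# example's exact calls may differ
lr_pipeline.fit(data)

# compute per-example feature contributions to the score
lr_contributions = lr_pipeline.get_feature_contributions(data)
print(lr_contributions.head())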
def data_pandas(self):
    simpleinput_file = get_dataset("gen_tickettrain").as_filepath()
    data = pd.read_csv(simpleinput_file)
    data['group'] = data['group'].astype(str)
    return data
# PcaTransformer
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.decomposition import PcaTransformer
from nimbusml.ensemble import LightGbmBinaryClassifier
from nimbusml.preprocessing.schema import ColumnConcatenator

# use the built-in data set 'infert' to create test and train data
#    Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#    pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs
train_file = get_dataset("infert").as_filepath()
schema = "col=none:R4:0 col=education:R4:1 col=age:R4:2 col=parity:R4:3 " \
         "col=induced:R4:4 col=case:R4:5 col=spontaneous:R4:6 " \
         "col=stratum:R4:7 col=pooledstratum:R4:8 col=educationstr:R4:9 " \
         "sep=, header=+"
fds = FileDataStream(train_file, schema=schema)

# target and features columns
y = 'case'
X = ['age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooledstratum']

# observe gradual impact of dimensionality reduction on AUC
# reducing dimensions should degrade signal gradually, while
# maintaining the traits in original dataset as much as possible.
for rank in range(len(X), 2, -1):
    print('Number of dimensions=', rank)
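    # a hedged sketch of the loop body: concatenate the features into one
    # vector column, reduce it to `rank` dimensions with PCA, train a
    # classifier, and report AUC (the original example's exact steps may
    # differ)
    pipe = Pipeline([
        ColumnConcatenator() << {'X': X},
        PcaTransformer(rank=rank) << 'X',
        LightGbmBinaryClassifier(feature=['X'], label=y)
    ])
    metrics, _ = pipe.fit(fds).test(fds, y)
    print('AUC=', metrics['AUC'].values)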
###############################################################################
# LightGbmRanker
import numpy as np
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker

# data input (as a FileDataStream)
path = get_dataset('gen_tickettrain').as_filepath()

# LightGbmRanker requires key type for group column
data = FileDataStream.read_csv(path, dtype={'group': np.uint32})

# define the training pipeline
pipeline = Pipeline([
    LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                   label='rank',
                   group_id='group')
])

# train, predict, and evaluate.
# TODO: Replace with CV
metrics, predictions = pipeline \
    .fit(data) \
    .test(data, output_scores=True)

# print predictions
print(predictions.head())
#       Score
# 0 -0.124121
# 1 -0.124121
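# print evaluation metrics (NDCG@N and DCG@N for a ranker; the values
# depend on the trained model, so none are shown here)
print(metrics)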
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastForestBinaryClassifier, FastForestRegressor
from nimbusml.ensemble import FastTreesBinaryClassifier, FastTreesRegressor
from nimbusml.ensemble import FastTreesTweedieRegressor
from nimbusml.linear_model import AveragedPerceptronBinaryClassifier
from nimbusml.linear_model import FastLinearBinaryClassifier
from nimbusml.linear_model import FastLinearClassifier
from nimbusml.linear_model import FastLinearRegressor
from nimbusml.linear_model import LogisticRegressionBinaryClassifier
from nimbusml.linear_model import LogisticRegressionClassifier
from nimbusml.linear_model import OnlineGradientDescentRegressor
from nimbusml.linear_model import OrdinaryLeastSquaresRegressor
from nimbusml.linear_model import PoissonRegressionRegressor
from nimbusml.linear_model import SgdBinaryClassifier
# from nimbusml.linear_model import SymSgdBinaryClassifier
from nimbusml.multiclass import OneVsRestClassifier
from nimbusml.naive_bayes import NaiveBayesClassifier
from sklearn.utils.testing import assert_raises

train_file = get_dataset("uciadult_train").as_filepath()
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'ethnicity', 'sex', 'native-country-region'
]
file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 col=workclass:TX:1 ' \
              'col=education:TX:2 col=marital-status:TX:3 ' \
              'col=occupation:TX:4 col=relationship:TX:5 col=ethnicity:TX:6 ' \
              'col=sex:TX:7 col=native-country-region:TX:8 header+'
label_column = 'label'
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
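    # a hedged completion of this truncated list with the remaining
    # learners imported above (the original file may order or configure
    # them differently)
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor(),
    SgdBinaryClassifier(),
    OneVsRestClassifier(FastLinearBinaryClassifier()),
    NaiveBayesClassifier(),
]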
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import EnsembleRegressor
from nimbusml.ensemble.feature_selector import RandomFeatureSelector
from nimbusml.ensemble.output_combiner import RegressorMedian
from nimbusml.ensemble.subset_selector import RandomPartitionSelector
from nimbusml.ensemble.sub_model_selector import RegressorBestDiverseSelector
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# use the built-in data set 'airquality' to create test and train data
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2
np.random.seed(0)

df = get_dataset("airquality").as_df().fillna(0)
df = df[df.Ozone.notnull()]

X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'Ozone'], df['Ozone'])

# train a model with default sampling and ensembling parameters and score
ensemble_with_defaults = EnsembleRegressor(num_models=3).fit(X_train, y_train)
scores = ensemble_with_defaults.predict(X_test)

# evaluate the model
print('R-squared fit:', r2_score(y_test, scores))
# R-squared fit: 0.12144964995862884
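# a hedged sketch motivated by the otherwise-unused imports above: the
# same ensemble with customized subset selection, sub-model selection,
# and output combining; the parameter names below follow the nimbusml
# ensemble API as I recall it and are assumptions, not the original
# example's exact settings
ensemble_with_options = EnsembleRegressor(
    num_models=3,
    sampling_type=RandomPartitionSelector(
        feature_selector=RandomFeatureSelector(
            features_selection_proportion=0.7)),
    sub_model_selector_type=RegressorBestDiverseSelector(),
    output_combiner=RegressorMedian()).fit(X_train, y_train)
print('R-squared fit:',
      r2_score(y_test, ensemble_with_options.predict(X_test)))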
###############################################################################
# LightGbmClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmClassifier
from sklearn.model_selection import train_test_split

# use 'iris' data set to create test and train data
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Label Species  Setosa
# 0           5.1          3.5           1.4          0.2      0  setosa     1.0
# 1           4.9          3.0           1.4          0.2      0  setosa     1.0
np.random.seed(0)

df = get_dataset("iris").as_df()
df.drop(['Species'], inplace=True, axis=1)

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

lr = LightGbmClassifier().fit(X_train, y_train)
scores = lr.predict(X_test)

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
def infert_df(label_name):
    df = get_dataset('infert').as_df()
    df = (OneHotVectorizer() << 'education_str').fit_transform(df)
    X, y = split_features_and_label(df, label_name)
    return X, y
import os

import numpy as np
import pandas as pd
from nimbusml.datasets import get_dataset

SHOW_ONNX_JSON = False
SHOW_TRANSFORMED_RESULTS = False
SHOW_FULL_PANDAS_OUTPUT = False

if SHOW_FULL_PANDAS_OUTPUT:
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.width', 10000)

script_path = os.path.realpath(__file__)
script_dir = os.path.dirname(script_path)

#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Label Species  Setosa
# 0           5.1          3.5           1.4          0.2      0  setosa     1.0
# 1           4.9          3.0           1.4          0.2      0  setosa     1.0
iris_df = get_dataset("iris").as_df()
iris_df.drop(['Species'], axis=1, inplace=True)

iris_with_nan_df = iris_df.copy()
iris_with_nan_df.loc[1, 'Petal_Length'] = np.nan

iris_no_label_df = iris_df.drop(['Label'], axis=1)
iris_binary_df = iris_no_label_df.rename(columns={'Setosa': 'Label'})
iris_regression_df = iris_no_label_df.drop(
    ['Setosa'], axis=1).rename(columns={'Petal_Width': 'Label'})

#    Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#    pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs
infert_df = get_dataset("infert").as_df()
infert_df.columns = [i.replace(': ', '') for i in infert_df.columns]
# Permutation Feature Importance (PFI): a permutation analysis across all
# the features of a model, one after another.
# PFI is supported for binary classifiers, classifiers, regressors, and
# rankers.
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionBinaryClassifier, \
    FastLinearClassifier, FastLinearRegressor
from nimbusml.preprocessing import ToKey
from numpy.testing import assert_almost_equal

# data input (as a FileDataStream)
adult_path = get_dataset('uciadult_train').as_filepath()
classification_data = FileDataStream.read_csv(adult_path)
print(classification_data.head())
#    label  workclass     education  ... capital-loss hours-per-week
# 0      0    Private          11th  ...            0             40
# 1      0    Private       HS-grad  ...            0             50
# 2      1  Local-gov    Assoc-acdm  ...            0             40
# 3      1    Private  Some-college  ...            0             40
# 4      0          ?  Some-college  ...            0             30

######################################
# PFI for Binary Classification models
######################################
# define the training pipeline with a binary classifier
binary_pipeline = Pipeline([
    OneHotVectorizer(columns=['education']),
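    # a hedged completion of this truncated excerpt: finish the pipeline
    # with the imported binary classifier (the feature choices here are
    # illustrative, not necessarily the original file's)
    LogisticRegressionBinaryClassifier(
        feature=['age', 'education', 'hours-per-week'], label='label')
])

# train the model, then compute permutation feature importance on the
# same data via nimbusml's permutation_feature_importance method
binary_model = binary_pipeline.fit(classification_data)
binary_pfi = binary_model.permutation_feature_importance(classification_data)
print(binary_pfi.head())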