def test_syntax11_append_insert(self):
    """Exercise list-style editing of a Pipeline before and after fit.

    Steps are appended, inserted and deleted while the pipeline is still
    unfitted.  After ``fit`` every structural change (append, insert,
    delete) must raise ``RuntimeError``; the test fails explicitly when
    such a change is silently accepted.
    """
    df = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                               workclass=['X', 'X', 'Y', 'Y', 'Y'],
                               y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)

    exp = Pipeline()
    exp.append(("OneHotHashVectorizer",
                OneHotHashVectorizer() << {'edu2': 'education'}))
    exp.insert(0, OneHotVectorizer() << {'edu1': 'education'})
    exp.append(
        FastLinearBinaryClassifier(maximum_number_of_iterations=1) << {
            'Features': ['edu1', 'edu2'],
            Role.Label: 'y'})
    # Add a throw-away step and delete it again: the length must be back
    # to the three steps added above.
    exp.append(OneHotHashVectorizer() << {'edu2': 'education'})
    del exp[-1]
    assert len(exp) == 3

    exp.fit(df, verbose=0)
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert sorted(prediction.columns) == [
        'PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)

    frozen_msg = "Model is fitted and cannot be modified"

    # Each modification below must be rejected; the ``else`` branches make
    # the test fail instead of passing vacuously when nothing is raised.
    try:
        exp.append(OneHotHashVectorizer() << {'edu2': 'education'})
    except RuntimeError as e:
        assert frozen_msg in str(e)
    else:
        raise AssertionError(
            "append() on a fitted pipeline did not raise RuntimeError")

    try:
        exp.insert(0, OneHotHashVectorizer() << {'edu2': 'education'})
    except RuntimeError as e:
        assert frozen_msg in str(e)
    else:
        raise AssertionError(
            "insert() on a fitted pipeline did not raise RuntimeError")

    try:
        del exp[0]
    except RuntimeError as e:
        assert frozen_msg in str(e)
    else:
        raise AssertionError(
            "del on a fitted pipeline did not raise RuntimeError")

    # Integer indexing: the second element of the step at position 1 is
    # expected to be the hash vectorizer appended first.
    obj = exp[1][1]
    assert obj.__class__.__name__ == "OneHotHashVectorizer"

    # Lookup by step name.
    res = exp['OneHotHashVectorizer']
    assert len(res) == 1

    graph = exp.graph_
    assert len(graph.nodes) >= len(exp)
def test_syntax3(self):
    """Fit and predict with transforms wired through ``<<`` and a learner
    that receives no explicit column specification."""
    frame = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    features = frame.drop('y', axis=1)
    target = frame['y']

    steps = [
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << 'education',
        OneHotVectorizer(max_num_terms=2) << 'workclass',
        # Currently the learner does not use edu1 unless it is specified
        # explicitly, so nimbusml does not do what the syntax implicitly
        # suggests. The bridge would need to look into every available
        # column at each step.
        FastLinearBinaryClassifier(max_iterations=1),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(features, target)

    scored = pipeline.predict(features)
    assert isinstance(scored, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(scored.columns) == expected_columns
    assert scored.shape == (5, 3)
def test_syntax11_learner(self):
    """Fit from a single frame, the learner naming its own feature and
    label columns through the ``<<`` role mapping."""
    frame = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    features = frame.drop('y', axis=1)

    learner_roles = {'Features': ['edu1', 'edu2'], Role.Label: 'y'}
    pipeline = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        FastLinearBinaryClassifier(max_iterations=1) << learner_roles,
    ])
    # The label column lives inside the frame, so no separate y is passed.
    pipeline.fit(frame)

    scored = pipeline.predict(features)
    assert isinstance(scored, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(scored.columns) == expected_columns
    assert scored.shape == (5, 3)
def test_syntax6_regular_expression(self):
    """Concatenate generated columns f1..f3 into ``Features``, apply
    ``Drop() << '~Features'`` and train on what remains."""
    frame = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    features = frame.drop('y', axis=1)
    target = frame['y']

    # Column names f1, f2, f3 produced by the three vectorizers below.
    generated = ['f%d' % i for i in range(1, 4)]
    pipeline = Pipeline([
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': generated},
        # NOTE(review): '~Features' presumably means "everything except
        # Features" - confirm against the Drop transform documentation.
        Drop() << '~Features',
        FastLinearBinaryClassifier(max_iterations=1),
    ])
    pipeline.fit(features, target)

    scored = pipeline.predict(features)
    assert isinstance(scored, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(scored.columns) == expected_columns
    assert scored.shape == (5, 3)
def test_syntax5_regular_expression(self):
    """Select the columns to concatenate with the pattern ``f[0-9]+``."""
    # REVIEW: not implemented yet.
    # Ideally regular expressions would be handled inside nimbusml. That
    # could happen in entrypoint.py just before calling nimbusml, or
    # inside Pipeline if it is aware of the input schema.
    frame = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    features = frame.drop('y', axis=1)
    target = frame['y']

    pipeline = Pipeline([
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': 'f[0-9]+'},
        FastLinearBinaryClassifier(max_iterations=1) << 'Features',
    ])
    pipeline.fit(features, target)

    scored = pipeline.predict(features)
    assert isinstance(scored, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(scored.columns) == expected_columns
    assert scored.shape == (5, 3)
def test_syntax4_fail2(self):
    """Fitting must fail when the learner references a column ('edu4')
    that no previous step produces."""
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
        # 'edu4' is deliberately not produced by any step above.
        FastLinearBinaryClassifier(max_iterations=1)
        << ['edu1', 'edu4', 'wki'],
    ])

    try:
        exp.fit(X, y)
    except Exception as e:
        assert "Feature column 'edu4' not found" in str(e)
    else:
        # Raised outside the ``try`` so the broad handler above cannot
        # swallow it and replace it with a message-less assertion.
        raise AssertionError("The test should not reach this line.")
def test_syntax4_fail(self):
    """Fitting must raise ``RuntimeError`` when several columns are given
    to the learner as a plain list; the error message is expected to
    contain the suggested ``ConcatTransform`` expression."""
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
        FastLinearBinaryClassifier(max_iterations=1)
        << ['edu1', 'edu2', 'wki'],
    ])

    try:
        exp.fit(X, y)
    except RuntimeError as e:
        assert "ConcatTransform() << {'Input': ['edu1', 'edu2', 'wki']}" \
            in str(e)
    else:
        # Explicit raise (rather than ``assert False``) so the check is
        # not stripped under ``python -O`` and reports why it failed.
        raise AssertionError("fit() was expected to raise RuntimeError.")
def test_syntax4_dict(self):
    """Concatenate three generated columns into ``Inputs`` and train the
    learner on that single column."""
    frame = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    features = frame.drop('y', axis=1)
    target = frame['y']

    pipeline = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
        Concat() << {'Inputs': ['edu1', 'edu2', 'wki']},
        FastLinearBinaryClassifier(max_iterations=1) << 'Inputs',
    ])
    pipeline.fit(features, target)

    scored = pipeline.predict(features)
    assert isinstance(scored, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(scored.columns) == expected_columns
    assert scored.shape == (5, 3)
def test_clone_sweep(self):
    """Grid-search a pipeline, clone it, grid-search the clone and check
    that both searches select the same number of trees."""
    np.random.seed(0)
    X_train, y_train = get_X_y(train_file, label_column,
                               sep=',', encoding='utf-8')
    # Loaded but not used by the assertions below.
    _X_test, _y_test = get_X_y(test_file, label_column,
                               sep=',', encoding='utf-8')

    named_steps = [
        ('cat', OneHotHashVectorizer() << categorical_columns),
        ('learner', FastTreesBinaryClassifier(number_of_trees=100,
                                              number_of_leaves=5)),
    ]
    pipe = Pipeline(steps=named_steps)
    param_grid = {'learner__number_of_trees': [1, 5, 10]}

    first_search = GridSearchCV(pipe, param_grid)
    first_search.fit(X_train, y_train)

    second_search = GridSearchCV(pipe.clone(), param_grid)
    second_search.fit(X_train, y_train)

    key = 'learner__number_of_trees'
    assert first_search.best_params_[key] == second_search.best_params_[key]
def test_hyperparameters_sweep(self):
    """Grid search over a pipeline that mixes named and unnamed steps."""
    np.random.seed(0)
    frame = pd.DataFrame({
        'education': ['A', 'A', 'A', 'A', 'B', 'A', 'B'],
        'workclass': ['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
        'y': [1, 0, 1, 1, 0, 1, 0],
    })
    features = frame.drop('y', axis=1)
    target = frame['y']

    pipe = Pipeline([
        ('cat', OneHotVectorizer() << 'education'),
        # Unnamed step: it is not addressed by the grid and stays as is.
        OneHotHashVectorizer() << 'workclass',
        # number_of_trees=0 is never actually run: the grid below always
        # overrides it.
        ('learner', FastTreesBinaryClassifier(number_of_trees=0,
                                              number_of_leaves=2)),
    ])
    param_grid = {
        'cat__output_kind': ['Indicator', 'Binary'],
        'learner__number_of_trees': [1, 2, 3],
    }

    search = GridSearchCV(pipe, param_grid)
    search.fit(features, target)
    print(search.best_params_)

    expected_best = {
        'cat__output_kind': 'Indicator',
        'learner__number_of_trees': 1,
    }
    assert search.best_params_ == expected_best
def test_syntax6_change_role(self):
    """Drop every column except the concatenated ``Features`` and train
    the learner on it explicitly."""
    # REVIEW: the pipeline drops all columns but one, yet nimbusml still
    # thinks the features are education and workclass; it does not
    # automatically detect that the only remaining column should play
    # that role (maybe because the label column is present too, even
    # though the only remaining column without a role is Features).
    frame = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'y': [1, 0, 1, 0, 0],
    })
    features = frame.drop('y', axis=1)
    target = frame['y']

    generated = ['f%d' % i for i in range(1, 4)]
    pipeline = Pipeline([
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': generated},
        Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
        FastLinearBinaryClassifier(maximum_number_of_iterations=1)
        << ['Features'],
    ])
    pipeline.fit(features, target, verbose=0)

    scored = pipeline.predict(features)
    assert isinstance(scored, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(scored.columns) == expected_columns
    assert scored.shape == (5, 3)
def test_split_start(self):
    """Run ``check_cv`` with ``split_start='try_all'`` on a pipeline that
    has several transforms in front of the learner."""
    edu_onehot = OneHotVectorizer(columns={'edu': 'education'})
    edu_hashed = OneHotHashVectorizer(columns={'edu_hash': 'education'})
    drop_source = ColumnDropper(columns='education')

    pipeline = self.pipeline(
        transforms=[edu_onehot, edu_hashed, drop_source],
        learner_arguments={'feature': ['Features', 'edu', 'edu_hash']},
    )
    check_cv(pipeline, self.data('Label'), split_start='try_all')
def test_error_conditions(self):
    """Grid search over a parameter the learner does not have must raise
    ``ValueError``."""
    np.random.seed(0)
    X_train, y_train = get_X_y(train_file, label_column,
                               sep=',', encoding='utf-8')
    # Loaded but not used by the assertion below.
    _X_test, _y_test = get_X_y(test_file, label_column,
                               sep=',', encoding='utf-8')

    named_steps = [
        ('cat', OneHotHashVectorizer() << categorical_columns),
        ('learner', FastTreesBinaryClassifier(num_trees=100, num_leaves=5)),
    ]
    pipe = Pipeline(steps=named_steps)

    # 'wrong_arg' is not a parameter of the learner step.
    bad_grid = {'learner__wrong_arg': [1, 5, 10]}
    search = GridSearchCV(pipe, bad_grid)
    assert_raises(ValueError, search.fit, X_train, y_train)
def check_cv_with_defaults_df(
        self,
        label_name='rank',
        group_id='group',
        features=['price', 'Class', 'dep_day', 'nbr_stops', 'duration'],
        **params):
    """Run ``check_cv`` on a ranking pipeline fed with ``data_pandas()``.

    :param label_name: name of the label column given to the ranker.
    :param group_id: name of the group column; it is hashed in place by
        the first step and then given to the ranker as its group id.
    :param features: feature column names given to the ranker. The
        default list is shared between calls and is only read here.
    :param params: forwarded unchanged to ``check_cv``.
    """
    steps = [
        OneHotHashVectorizer(output_kind='Key') << {group_id: group_id},
        # Use the arguments rather than hard-coded 'rank' / 'group' so the
        # ranker stays in sync with the column hashed above when a caller
        # overrides the defaults.
        LightGbmRanker(min_data_per_leaf=1,
                       feature=features,
                       label=label_name,
                       group_id=group_id),
    ]
    data = self.data_pandas()
    check_cv(pipeline=Pipeline(steps), X=data, **params)
def check_cv_with_defaults2(self,
                            label_name='Label',
                            group_id='GroupId',
                            features='Features_1',
                            **params):
    """Run ``check_cv`` on a ranking pipeline that concatenates
    ``features`` into ``Features`` and only tells the ranker which column
    is the group id. Extra keyword arguments go to ``check_cv``."""
    hash_group = OneHotHashVectorizer(output_kind='Key') << {
        group_id: group_id}
    concat = ColumnConcatenator() << {'Features': [features]}
    ranker = LightGbmRanker(min_data_per_leaf=1) << {
        Role.GroupId: group_id}

    data = self.data_wt_rename(label_name, group_id, features)
    check_cv(pipeline=Pipeline([hash_group, concat, ranker]),
             X=data, **params)
def test_numeric_columns(self):
    """Apply ``OneHotHashVectorizer`` to the infert data read with float32
    numerics, once with renamed output columns and once in place."""
    infert_path = get_dataset('infert').as_filepath()
    stream = FileDataStream.read_csv(infert_path, sep=',',
                                     numeric_dtype=np.float32)

    # Output columns get new names (dict form: output -> input).
    renamed = {'edu': 'education', 'in': 'induced', 'sp': 'spontaneous'}
    hasher = OneHotHashVectorizer(columns=renamed, number_of_bits=2)
    hasher.fit_transform(stream)

    # Columns given as a plain list.
    in_place = ['education', 'induced', 'spontaneous']
    hasher = OneHotHashVectorizer(columns=in_place, number_of_bits=2)
    hasher.fit_transform(stream)
def check_cv_with_defaults(self,
                           label_name='Label',
                           group_id='GroupId',
                           features='Features_1',
                           **params):
    """Run ``check_cv`` on a ranking pipeline whose learner is given the
    group, feature and label roles explicitly. Extra keyword arguments go
    to ``check_cv``."""
    hash_group = OneHotHashVectorizer(output_kind='Key') << {
        group_id: group_id}
    # Even when all the needed roles are specified below, the roles are
    # still not passed correctly.
    ranker_roles = {
        Role.GroupId: group_id,
        Role.Feature: features,
        Role.Label: label_name,
    }
    ranker = LightGbmRanker(min_data_per_leaf=1) << ranker_roles

    data = self.data(label_name, group_id, features)
    check_cv(pipeline=Pipeline([hash_group, ranker]), X=data, **params)
def test_uciadult_sweep(self):
    """Grid-search ``number_of_trees`` and confirm the selected value by
    fully training with every candidate and comparing AUC."""
    np.random.seed(0)
    X_train, y_train = get_X_y(train_file, label_column,
                               sep=',', encoding='utf-8')
    # Loaded but not used by the assertions below.
    _X_test, _y_test = get_X_y(test_file, label_column,
                               sep=',', encoding='utf-8')

    cat = OneHotHashVectorizer() << categorical_columns
    # number_of_trees=100 is never actually run by the grid search because
    # it is not in the parameter grid below.
    learner = FastTreesBinaryClassifier(number_of_trees=100,
                                        number_of_leaves=5)
    pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

    candidates = [1, 5, 10]
    search = GridSearchCV(pipe, {'learner__number_of_trees': candidates})
    search.fit(X_train, y_train)
    assert search.best_params_['learner__number_of_trees'] == 10

    # Compare AUC for number_of_trees 1, 5 and 10 on a full train.
    metrics_by_trees = {}
    for n_trees in candidates:
        pipe.set_params(learner__number_of_trees=n_trees)
        pipe.fit(X_train, y_train)
        metrics, _ = pipe.test(X_train, y_train)
        metrics_by_trees[n_trees] = metrics

    assert metrics_by_trees[10]['AUC'][0] > metrics_by_trees[5]['AUC'][0]
    assert metrics_by_trees[10]['AUC'][0] > metrics_by_trees[1]['AUC'][0]
    assert metrics_by_trees[10]['AUC'][0] > 0.59
# GridSearchCV with Pipeline: hyperparameter grid search.
import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.feature_extraction.categorical import (
    OneHotHashVectorizer,
    OneHotVectorizer,
)
from sklearn.model_selection import GridSearchCV

# Small in-memory data set: two categorical columns and a binary label.
df = pd.DataFrame({
    'education': ['A', 'B', 'A', 'B', 'A'],
    'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
    'y': [1, 0, 1, 0, 0],
})
X = df.drop('y', axis=1)
y = df['y']

pipe = Pipeline([
    ('cat', OneHotVectorizer() << 'education'),
    # Unnamed step: not addressed by the grid, so it stays the same.
    OneHotHashVectorizer() << 'workclass',
    # This FastTreesBinaryClassifier instance with num_trees=0 is never
    # run by the grid search: num_trees is always overridden by param_grid.
    ('learner', FastTreesBinaryClassifier(num_trees=0, num_leaves=2)),
])

param_grid = {
    'cat__output_kind': ['Ind', 'Bin'],
    'learner__num_trees': [1, 2, 3],
}
grid = GridSearchCV(pipe, param_grid, cv=3, iid='warn')

grid.fit(X, y)
print(grid.best_params_)
# {'cat__output_kind': 'Ind', 'learner__num_trees': 1}
def test_performance_syntax(self):
    """Train the same pipeline twice, once with keyword arguments and once
    with the ``<<`` syntax, and check both reach similar accuracy/AUC."""
    train_file = get_dataset('uciadult_train').as_filepath()
    test_file = get_dataset('uciadult_test').as_filepath()
    file_schema = ('sep=, col=label:R4:0 col=Features:R4:9-14 '
                   'col=workclass:TX:1 col=education:TX:2 '
                   'col=marital-status:TX:3 col=occupation:TX:4 '
                   'col=relationship:TX:5 col=ethnicity:TX:6 '
                   'col=sex:TX:7 col=native-country-region:TX:8 header+')
    categorical_columns = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'ethnicity', 'sex', 'native-country-region',
    ]
    label_column = 'label'
    na_columns = ['Features']
    feature_columns_idv = na_columns + categorical_columns

    def fit_and_score(pipeline):
        # Train on the train file, predict on the test file, print the
        # timings and metrics, and return (accuracy, AUC).
        pipeline.fit(FileDataStream(train_file, schema=file_schema),
                     label_column, verbose=0)
        print("train time %s" % pipeline._run_time)

        scored = pipeline.predict(
            FileDataStream(test_file, schema=file_schema))
        print("predict time %s" % pipeline._run_time)

        _, label_test = get_X_y(test_file, label_column, sep=',')
        acc, auc = evaluate_binary_classifier(
            label_test.iloc[:, 0].values,
            scored.loc[:, 'PredictedLabel'].values,
            scored.loc[:, 'Probability'].values)
        print('ACC %s, AUC %s' % (acc, auc))
        return acc, auc

    # Variant 1: columns passed as constructor arguments.
    acc1, auc1 = fit_and_score(Pipeline([
        OneHotHashVectorizer(columns=categorical_columns),
        Handler(columns=na_columns),
        FastLinearBinaryClassifier(feature=feature_columns_idv,
                                   label=label_column),
    ]))

    # Variant 2: columns attached with the << operator.
    acc2, auc2 = fit_and_score(Pipeline([
        OneHotHashVectorizer() << categorical_columns,
        Handler() << na_columns,
        FastLinearBinaryClassifier() << feature_columns_idv,
    ]))

    assert abs(acc1 - acc2) < 0.02
    assert abs(auc1 - auc2) < 0.02
'Sepal_Length']}, LpScaler(columns={'normed_columns': 'concated_columns'}) ]), 'MutualInformationSelector': Pipeline([ ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}), MutualInformationSelector( columns='Features', label='Label', slots_in_output=2) # only accept one column ]), 'NaiveBayesClassifier': NaiveBayesClassifier(feature=['Sepal_Width', 'Sepal_Length']), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=Ngram(), char_feature_extractor=Ngram(), keep_diacritics=True, columns={ 'features': ['SentimentText']}), 'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']), 'OneHotVectorizer': OneHotVectorizer(columns=['education_str']), 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \ OneVsRestClassifier(AveragedPerceptronBinaryClassifier(), use_probabilities=True, feature=['age', 'education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs'], label='induced'), 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \ OneVsRestClassifier(LinearSvmBinaryClassifier(), use_probabilities=True, feature=['age', 'education_str.0-5yrs', 'education_str.6-11yrs',
###############################################################################
# OneHotHashVectorizer
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer
from nimbusml.feature_selection import CountSelector

# Data input (as a FileDataStream).
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',')
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Hash 'education' into the new column 'edu', then apply CountSelector
# with count=5 to that column.
pip = Pipeline([
    OneHotHashVectorizer(columns=dict(edu='education'), hash_bits=2),
    CountSelector(count=5, columns=['edu']),
])

features_selection = pip.fit_transform(data)
print(features_selection.head())
#    age  case  edu.0  edu.1 education  induced  parity  pooled.stratum  ...
# 0   26     1    0.0    1.0    0-5yrs        1       6               3  ...
# 1   42     1    0.0    1.0    0-5yrs        1       1               1  ...
# 2   39     1    0.0    1.0    0-5yrs        2       6               4  ...
# 3   34     1    0.0    1.0    0-5yrs        2       4               2  ...
# 4   35     1    1.0    0.0   6-11yrs        1       3              32  ...
###############################################################################
# OneHotHashVectorizer
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer
from nimbusml.feature_selection import CountSelector

# Data input (as a FileDataStream).
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',')
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Hash 'education' into the new column 'edu', then apply CountSelector
# with count=5 to that column.
pip = Pipeline([
    OneHotHashVectorizer(columns=dict(edu='education'), number_of_bits=2),
    CountSelector(count=5, columns=['edu']),
])

features_selection = pip.fit_transform(data)
print(features_selection.head())
#    age  case  edu.0  edu.1 education  induced  parity  pooled.stratum  ...
# 0   26     1    0.0    1.0    0-5yrs        1       6               3  ...
# 1   42     1    0.0    1.0    0-5yrs        1       1               1  ...
# 2   39     1    0.0    1.0    0-5yrs        2       6               4  ...
# 3   34     1    0.0    1.0    0-5yrs        2       4               2  ...
# 4   35     1    1.0    0.0   6-11yrs        1       3              32  ...
True, False, True, False, True, False, True ])) test_reviews = pandas.DataFrame(data=dict(review=[ "This is great", "I hate it", "Love it", "Really like it", "I hate it", "I like it a lot", "I love it", "I do like it", "I really hate it", "I love it" ])) # OneHotHashVectorizer transform: the entire string is treated as a category. # if output column name is same as input column, original input column values # are replaced. number_of_bits=6 will hash into 2^6 -1 dimensions y = train_reviews['like'] X = train_reviews.loc[:, train_reviews.columns != 'like'] cat = OneHotHashVectorizer(number_of_bits=6) << 'review' X = cat.fit_transform(X) # view the transformed numerical values and column names print(X) mymodel = LogisticRegressionBinaryClassifier().fit(X, y) X_test = cat.transform(test_reviews) scores = mymodel.predict(cat.transform(test_reviews)) # view the scores print(scores)
# GridSearchCV with Pipeline: grid search over learners
import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesBinaryClassifier, GamBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer
from nimbusml.linear_model import (
    FastLinearBinaryClassifier,
    LogisticRegressionBinaryClassifier,
)
from sklearn.model_selection import GridSearchCV

# Small in-memory data set: two categorical columns and a binary label.
df = pd.DataFrame({
    'education': ['A', 'B', 'A', 'B', 'A'],
    'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
    'y': [1, 0, 1, 0, 0],
})
X = df.drop('y', axis=1)
y = df['y']

cat = OneHotHashVectorizer() << ['education', 'workclass']
learner = FastTreesBinaryClassifier()
pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

# The grid varies the hashing width of the 'cat' step and swaps the whole
# 'learner' step between four candidate estimators.
param_grid = {
    'cat__hash_bits': [1, 2, 4, 6, 8, 16],
    'learner': [
        FastLinearBinaryClassifier(),
        FastTreesBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        GamBinaryClassifier(),
    ],
}
grid = GridSearchCV(pipe, param_grid, cv=3, iid='warn')

grid.fit(X, y)
print(grid.best_params_['learner'].__class__.__name__)
# FastLinearBinaryClassifier
###############################################################################
# OneHotHashVectorizer
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer

# Data input (as a FileDataStream).
path = get_dataset('infert').as_filepath()

# 'spontaneous' is read as a string: numeric input makes the
# OneHotHashVectorizer fail.
data = FileDataStream.read_csv(path, sep=',', dtype={'spontaneous': str})
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

xf = OneHotHashVectorizer(columns=dict(edu='education', sp='spontaneous'))

# Fit and transform.
features = xf.fit_transform(data)
print(features.head())
#    age  case  edu.0  edu.1003  ...  sp.995  ...  spontaneous  stratum
# 0   26     1    0.0       0.0  ...     0.0  ...          2.0      1.0
# 1   42     1    0.0       0.0  ...     0.0  ...          0.0      2.0
# 2   39     1    0.0       0.0  ...     0.0  ...          0.0      3.0