def test_check_estimator_fromkey(self):
    text_df = pandas.DataFrame(
        data=dict(
            text=["cat", "dog", "fish", "orange", "cat orange", "dog",
                  "fish", "spider"],
            num=[1, 2, 3, 4, 5, 6, 7, 8]))

    tokey = ToKey() << ['text']
    data_idv = tokey.fit_transform(text_df)
    assert data_idv is not None
    assert len(data_idv) > 0
    assert str(sorted([str(dt) for dt in data_idv.dtypes])) == \
        "['category', 'int64']"

    fromkey = FromKey() << ['text']
    data = fromkey.fit_transform(data_idv)
    assert str(list(data_idv['text'])) == str(list(data['text']))

    t = numpy.unique(data_idv['text'].cat.codes)
    assert len(t) == 6
    assert list(data_idv['text'].cat.categories) == [
        "cat", "dog", "fish", "orange", "cat orange", "spider"]
def test_pipeline_clone_filedatastream_roles_shift_operator(self):
    pipe = Pipeline([
        ToKey() << {'group_2': 'group_2'},
        LightGbmRanker(number_of_iterations=1, number_of_leaves=4) << {
            Role.Feature: features,
            Role.Label: 'label_1',
            Role.GroupId: 'group_2'}
    ])
    fit_test_clone_and_check(pipe, fds)
def test_check_estimator_fromkey_categories(self):
    text_df = pandas.DataFrame(
        data=dict(
            text=["cat", "dog", "fish", "orange", "cat orange", "dog",
                  "fish", "spider"]),
        dtype="category")

    tokey = ToKey() << ['text']
    data_idv = tokey.fit_transform(text_df)
    assert data_idv is not None
    assert len(data_idv) > 0
    assert data_idv['text'].dtype == 'category'
def test_pipeline_clone_filedatastream_roles_arguments(self):
    pipe = Pipeline([
        ToKey() << {'group_2': 'group_2'},
        LightGbmRanker(feature=features, label='label_1',
                       group_id='group_2', number_of_iterations=1,
                       number_of_leaves=4)
    ])
    fit_test_clone_and_check(pipe, fds)
def test_lightgbmranker_asdataframe(self):
    # Data file
    file_path = get_dataset("gen_tickettrain").as_filepath()
    df = pd.read_csv(file_path, encoding='utf-8')
    df['group'] = df['group'].astype(np.uint32)

    e = Pipeline([
        ToKey(columns={'rank': 'rank', 'group': 'group'}),
        LightGbmRanker() << {
            Role.Feature: ['Class', 'dep_day', 'duration'],
            Role.Label: 'rank',
            Role.GroupId: 'group'}
    ])

    e.fit(df)

    metrics, _ = e.test(df)
    assert_almost_equal(metrics['NDCG@1'][0], 0.43571429, decimal=7,
                        err_msg="NDCG@1 should be %s" % 0.43571429)
    assert_almost_equal(metrics['NDCG@2'][0], 0.5128226, decimal=7,
                        err_msg="NDCG@2 should be %s" % 0.5128226)
    assert_almost_equal(metrics['NDCG@3'][0], 0.55168069, decimal=7,
                        err_msg="NDCG@3 should be %s" % 0.55168069)
    assert_almost_equal(metrics['DCG@1'][0], 4.688759, decimal=3,
                        err_msg="DCG@1 should be %s" % 4.688759)
    assert_almost_equal(metrics['DCG@2'][0], 9.012395, decimal=3,
                        err_msg="DCG@2 should be %s" % 9.012395)
    assert_almost_equal(metrics['DCG@3'][0], 11.446943, decimal=3,
                        err_msg="DCG@3 should be %s" % 11.446943)
def setUpClass(self):
    adult_path = get_dataset('uciadult_train').as_filepath()
    self.classification_data = FileDataStream.read_csv(adult_path)
    binary_pipeline = Pipeline([
        OneHotVectorizer(columns=['education']),
        LogisticRegressionBinaryClassifier(
            feature=['age', 'education'], label='label',
            number_of_threads=1)
    ])
    self.binary_model = binary_pipeline.fit(self.classification_data)
    self.binary_pfi = self.binary_model.permutation_feature_importance(
        self.classification_data)

    classifier_pipeline = Pipeline([
        OneHotVectorizer(columns=['education']),
        FastLinearClassifier(feature=['age', 'education'], label='label',
                             number_of_threads=1, shuffle=False)
    ])
    self.classifier_model = classifier_pipeline.fit(
        self.classification_data)
    self.classifier_pfi = self.classifier_model.permutation_feature_importance(
        self.classification_data)

    infert_path = get_dataset('infert').as_filepath()
    self.regression_data = FileDataStream.read_csv(infert_path)
    regressor_pipeline = Pipeline([
        OneHotVectorizer(columns=['education']),
        FastLinearRegressor(feature=['induced', 'education'], label='age',
                            number_of_threads=1, shuffle=False)
    ])
    self.regressor_model = regressor_pipeline.fit(self.regression_data)
    self.regressor_pfi = self.regressor_model.permutation_feature_importance(
        self.regression_data)

    ticket_path = get_dataset('gen_tickettrain').as_filepath()
    self.ranking_data = FileDataStream.read_csv(ticket_path)
    ranker_pipeline = Pipeline([
        ToKey(columns=['group']),
        LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                       label='rank', group_id='group',
                       random_state=0, number_of_threads=1)
    ])
    self.ranker_model = ranker_pipeline.fit(self.ranking_data)
    self.ranker_pfi = self.ranker_model.permutation_feature_importance(
        self.ranking_data)
def test_lightgbmranker_asfilestream(self):
    # Data file
    file_path = get_dataset("gen_tickettrain").as_filepath()

    # Pure-nimbusml paradigm
    train_stream = FileDataStream.read_csv(file_path, encoding='utf-8')

    # pipeline
    pipeline = Pipeline([
        # the group_id column must be of key type
        ToKey(columns={'rank': 'rank', 'group': 'group'}),
        LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                       label='rank', group_id='group')
    ])

    # train
    pipeline.fit(train_stream)

    # test
    eval_stream = FileDataStream.read_csv(file_path)
    metrics, _ = pipeline.test(eval_stream)
    assert_almost_equal(metrics['NDCG@1'][0], 43.571429, decimal=5,
                        err_msg="NDCG@1 should be %s" % 43.571429)
    assert_almost_equal(metrics['NDCG@2'][0], 51.28226, decimal=5,
                        err_msg="NDCG@2 should be %s" % 51.28226)
    assert_almost_equal(metrics['NDCG@3'][0], 55.168069, decimal=5,
                        err_msg="NDCG@3 should be %s" % 55.168069)
    assert_almost_equal(metrics['DCG@1'][0], 4.688759, decimal=3,
                        err_msg="DCG@1 should be %s" % 4.688759)
    assert_almost_equal(metrics['DCG@2'][0], 9.012395, decimal=3,
                        err_msg="DCG@2 should be %s" % 9.012395)
    assert_almost_equal(metrics['DCG@3'][0], 11.446943, decimal=3,
                        err_msg="DCG@3 should be %s" % 11.446943)
def check_cv_with_defaults_df(
        self,
        label_name='rank',
        group_id='group',
        features=['price', 'Class', 'dep_day', 'nbr_stops', 'duration'],
        **params):
    steps = [
        ToKey() << {group_id: group_id},
        LightGbmRanker(min_data_per_leaf=1, feature=features,
                       label='rank', group_id='group')
    ]
    data = self.data_pandas()
    check_cv(pipeline=Pipeline(steps), X=data, **params)
def test_lightgbmranker_asdataframe_groupid(self):
    # Data file
    file_path = get_dataset("gen_tickettrain").as_filepath()
    df = pd.read_csv(file_path, encoding='utf-8')
    df['group'] = df['group'].astype(np.uint32)

    e = Pipeline([
        ToKey(columns={'rank': 'rank', 'group': 'group'}),
        LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                       label='rank', group_id='group')
    ])

    e.fit(df)

    metrics, _ = e.test(df)
    assert_almost_equal(metrics['NDCG@1'][0], 43.571429, decimal=5,
                        err_msg="NDCG@1 should be %s" % 43.571429)
    assert_almost_equal(metrics['NDCG@2'][0], 51.28226, decimal=5,
                        err_msg="NDCG@2 should be %s" % 51.28226)
    assert_almost_equal(metrics['NDCG@3'][0], 55.168069, decimal=5,
                        err_msg="NDCG@3 should be %s" % 55.168069)
    assert_almost_equal(metrics['DCG@1'][0], 4.688759, decimal=3,
                        err_msg="DCG@1 should be %s" % 4.688759)
    assert_almost_equal(metrics['DCG@2'][0], 9.012395, decimal=3,
                        err_msg="DCG@2 should be %s" % 9.012395)
    assert_almost_equal(metrics['DCG@3'][0], 11.446943, decimal=3,
                        err_msg="DCG@3 should be %s" % 11.446943)
def check_cv_with_defaults2(self, label_name='Label', group_id='GroupId',
                            features='Features_1', **params):
    # REVIEW: Switch back from ToKey() to OneHotHashVectorizer() and
    # reinstate the metrics checks once issue
    # https://github.com/dotnet/machinelearning/issues/1939 is resolved.
    params.pop('expected_metrics', None)
    steps = [
        ToKey() << {group_id: group_id},
        ColumnConcatenator() << {'Features': [features]},
        LightGbmRanker(min_data_per_leaf=1) << {Role.GroupId: group_id}
    ]
    data = self.data_wt_rename(label_name, group_id, features)
    check_cv(pipeline=Pipeline(steps), X=data, **params)
def test_fromkey_multiple_columns(self):
    df = pandas.DataFrame(data=dict(
        num1=[0, 1, 2, 3, 4, 5, 6],
        cat1=Categorical.from_codes([0, 2, 3, 1, 2, -1, 1],
                                    categories=["a", "b", "c", "d"]),
        cat2=Categorical.from_codes([2, 0, 1, 2, 0, 1, 1],
                                    categories=["e", "f", "g"]),
        num=[0, 1, 2, 3, 4, 5, 6],
        text1=["i", "j", "i", "j", "i", "j", "i"],
        text2=["k", "l", "l", "k", "k", "l", "k"]))

    concat = ColumnConcatenator() << {'textvec': ['text1', 'text2']}
    tokey = ToKey() << ['textvec']
    pipeline = Pipeline([concat, tokey])
    data_idv = pipeline.fit_transform(df)

    assert sorted(list(data_idv.columns)) == [
        'cat1', 'cat2', 'num', 'num1', 'text1', 'text2',
        'textvec.text1', 'textvec.text2']
    assert list(data_idv['cat1'].cat.categories) == ['a', 'b', 'c', 'd']
    assert list(data_idv['cat1'].cat.codes) == [0, 2, 3, 1, 2, -1, 1]
    assert list(data_idv['cat2'].cat.categories) == ['e', 'f', 'g']
    assert list(data_idv['cat2'].cat.codes) == [2, 0, 1, 2, 0, 1, 1]
    assert list(data_idv['textvec.text1'].cat.categories) == [
        'i', 'k', 'j', 'l']
    assert list(data_idv['textvec.text1'].cat.codes) == [
        0, 2, 0, 2, 0, 2, 0]
    assert list(data_idv['textvec.text2'].cat.categories) == [
        'i', 'k', 'j', 'l']
    assert list(data_idv['textvec.text2'].cat.codes) == [
        1, 3, 3, 1, 1, 3, 1]
def test_syntax12_group(self):
    # This test checks that a learner raises an exception
    # if a role is not allowed by the entrypoint.
    X = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             gr=[0, 0, 1, 1, 1],
             y=[1.1, 2.2, 1.24, 3.4, 3.4]))

    exp = Pipeline([
        OneHotVectorizer(columns=['workclass', 'education']),
        Concat(columns={'Feature': ['workclass', 'education']}),
        ToKey() << 'gr',
        FastTreesRegressor(number_of_trees=5, feature='Feature',
                           group_id='gr') << {Role.Label: 'y'}
    ])

    exp.fit(X, verbose=0)
    assert not hasattr(exp.nodes[-1], 'feature_')
    assert not hasattr(exp.nodes[-1], 'group_id_')
    assert exp.nodes[-1].feature_column_name_ == 'Feature'
    assert exp.nodes[-1].label_column_name_ == 'y'
    # assert not hasattr(exp.nodes[-1], 'row_group_column_name_')
    assert not hasattr(exp.nodes[-1], 'group_id_column')
    assert not hasattr(exp.nodes[-1], 'groupid_column_')
    assert not hasattr(exp.nodes[-1], 'groupid_column')
    if not hasattr(exp.nodes[-1], 'row_group_column_name_'):
        raise AssertionError("Attribute not found: {0}".format(
            ", ".join(sorted(dir(exp.nodes[-1])))))
    assert exp.nodes[-1].row_group_column_name_ == 'gr'

    # y is required here, as well as weight; both are replaced by fake
    # values. The test does not fail, but the weight is not taken into
    # account.
    X['y'] = -5
    X['weight'] = -5
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)
def check_cv_with_defaults(self, label_name='Label', group_id='GroupId',
                           features='Features_1', **params):
    # REVIEW: Switch back from ToKey() to OneHotHashVectorizer() and
    # reinstate the metrics checks once issue
    # https://github.com/dotnet/machinelearning/issues/1939 is resolved.
    params.pop('expected_metrics', None)
    steps = [
        ToKey() << {group_id: group_id},
        # Even though all the needed roles are specified on the following
        # line, they are still not passed through correctly.
        LightGbmRanker(min_data_per_leaf=1) << {
            Role.GroupId: group_id,
            Role.Feature: features,
            Role.Label: label_name}
    ]
    data = self.data(label_name, group_id, features)
    check_cv(pipeline=Pipeline(steps), X=data, **params)
def check_cv_with_non_defaults(self, label_name='label', group_id='groupid',
                               features='Features_1', **params):
    steps = [
        ToKey(columns={'groupid2': group_id, 'label2': label_name}),
        LightGbmRanker() << {
            Role.GroupId: 'groupid2',
            Role.Label: 'label2',
            Role.Feature: [features]}
    ]
    data = self.data(label_name, group_id, features)
    cv = CV(steps)
    results = cv.fit(data, groups='groupid', cv=4)
    check_cv_results(cv._learner_type, results, n_folds=4,
                     expected_metrics={})
def test_get_fit_info_ranker(self):
    file_path = get_dataset("gen_tickettrain").as_filepath()
    file_schema = 'sep=, col=Label_1:R4:0 col=GroupId_2:TX:1 ' \
                  'col=Features_3:R4:3-5'
    train_stream = FileDataStream(file_path, schema=file_schema)
    pipeline = Pipeline([
        ToKey() << {'GroupId_2': 'GroupId_2'},
        ColumnConcatenator() << {'Features': ['Features_3']},
        LightGbmRanker() << {
            Role.Feature: 'Features',
            Role.Label: 'Label_1',
            Role.GroupId: 'GroupId_2'}
    ])
    info = pipeline.get_fit_info(train_stream)
    last = info[0][-1]
    inp = last['inputs']
    assert 'GroupId:GroupId_2' in inp
###############################################################################
# FromKey
import pandas
from nimbusml.preprocessing import FromKey, ToKey
from pandas import Categorical

# Create the data
categorical_df = pandas.DataFrame(data=dict(
    key=Categorical.from_codes([0, 1, 2, 1, 2, 0],
                               categories=['a', 'b', 'c']),
    text=['b', 'c', 'a', 'b', 'a', 'c']))

fromkey = FromKey(columns='key')
y = fromkey.fit_transform(categorical_df)
print(y)

tokey = ToKey(columns='text')
y = tokey.fit_transform(categorical_df)
y2 = fromkey.clone().fit_transform(y)
print(y2['text'] == categorical_df['text'])
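# A hedged note on the round trip above (expected behavior, not captured
# output): ToKey maps each string in 'text' to a key value, and the cloned
# FromKey maps those keys back to the original strings, so the element-wise
# comparison should print True for all six rows.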
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import ToKey

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32,
                               names={0: 'id'})
print(data.head())
#     age  case education   id  induced  parity  pooled.stratum  spontaneous ...
# 0  26.0   1.0    0-5yrs  1.0      1.0     6.0             3.0          2.0 ...
# 1  42.0   1.0    0-5yrs  2.0      1.0     1.0             1.0          0.0 ...
# 2  39.0   1.0    0-5yrs  3.0      2.0     6.0             4.0          0.0 ...
# 3  34.0   1.0    0-5yrs  4.0      2.0     4.0             2.0          0.0 ...
# 4  35.0   1.0   6-11yrs  5.0      1.0     3.0            32.0          1.0 ...

# transform usage
xf = ToKey(columns={'id_1': 'id', 'edu_1': 'education'})

# fit and transform
features = xf.fit_transform(data)
print(features.head())
#     age  case   edu_1 education   id  id_1  induced  parity ...
# 0  26.0   1.0  0-5yrs    0-5yrs  1.0     0      1.0     6.0 ...
# 1  42.0   1.0  0-5yrs    0-5yrs  2.0     1      1.0     1.0 ...
# 2  39.0   1.0  0-5yrs    0-5yrs  3.0     2      2.0     6.0 ...
# 3  34.0   1.0  0-5yrs    0-5yrs  4.0     3      2.0     4.0 ...
# 4  35.0   1.0 6-11yrs   6-11yrs  5.0     4      1.0     3.0 ...
###############################################################################
# FromKey
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import FromKey, ToKey

# data input (as a FileDataStream)
path = get_dataset('topics').as_filepath()

# load data
data = FileDataStream.read_csv(path, sep=',')

# transform usage
pipeline = Pipeline([
    ToKey(columns=['review_reverse']),
    FromKey(columns=['review_reverse'])
])

# fit and transform
output = pipeline.fit_transform(data)
print(output.head())
#    label                               review                      review_reverse
# 0      1  animals birds cats dogs fish horse      radiation galaxy universe duck
# 1      0     horse birds house fish duck cats    space galaxy universe radiation
# 2      1          car truck driver bus pickup                          bus pickup
# 3      0    car truck driver bus pickup horse                           car truck
# 4      1                            car truck  car truck driver bus pickup horse
        'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa']}),
    'ColumnSelector': ColumnSelector(
        columns=['Sepal_Width', 'Sepal_Length']),
    'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
    'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
    'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
    'FastForestBinaryClassifier': FastForestBinaryClassifier(
        feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'),
    'FastLinearBinaryClassifier': FastLinearBinaryClassifier(
        feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'),
    'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
    'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
    'FromKey': Pipeline([
        ToKey(columns=['Sepal_Length']),
        FromKey(columns=['Sepal_Length'])
    ]),
    # GlobalContrastRowScaler currently requires a vector input to work
    'GlobalContrastRowScaler': Pipeline([
        ColumnConcatenator() << {
            'concated_columns': [
                'Petal_Length', 'Sepal_Width', 'Sepal_Length']},
        GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
    ]),
    'Handler': Handler(replace_with='Mean',
                       columns={'NewVals': 'Petal_Length'}),
    'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
    'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']),
    'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}),
###############################################################################
# ToKey
import pandas
from nimbusml.preprocessing import ToKey

# Create the data
text_df = pandas.DataFrame(
    data=dict(
        text=["cat", "dog", "fish", "orange", "cat orange", "dog",
              "fish", "spider"]))

tokey = ToKey() << 'text'
y = tokey.fit_transform(text_df)
print(y)
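# A hedged note on the expected result (exact formatting depends on pandas):
# the 'text' column comes back as a pandas Categorical (key type) with one
# category per distinct string; for this data the categories are
# ["cat", "dog", "fish", "orange", "cat orange", "spider"], as asserted in
# test_check_estimator_fromkey above.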
# PFI for Ranking models
########################

# load input data
ticket_path = get_dataset('gen_tickettrain').as_filepath()
ranking_data = FileDataStream.read_csv(ticket_path)
print(ranking_data.head())
#    rank  group carrier  price  Class  dep_day  nbr_stops  duration
# 0     2      1      AA    240      3        1          0      12.0
# 1     1      1      AA    300      3        0          1      15.0
# 2     1      1      AA    360      3        0          2      18.0
# 3     0      1      AA    540      2        0          0      12.0
# 4     1      1      AA    600      2        0          1      15.0

# define the training pipeline with a ranker
ranking_pipeline = Pipeline([
    ToKey(columns=['group']),
    LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                   label='rank', group_id='group')
])

# train the model
ranking_model = ranking_pipeline.fit(ranking_data)

# get permutation feature importance
ranking_pfi = ranking_model.permutation_feature_importance(ranking_data)

# Print PFI for each feature, ordered by the most important features with
# respect to DCG@1. Since DCG is an increasing metric, the largest negative
# changes indicate the most important features.
print("===================== PFI for Ranking Model =====================")
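# A minimal sketch of the printing step (this assumes ranking_pfi is a pandas
# DataFrame with one row per feature and a 'DCG@1' column of metric changes;
# the column name is inferred from the comment above, not verified here):
print(ranking_pfi.sort_values('DCG@1').head())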