Example #1
    def test_check_estimator_fromkey(self):
        text_df = pandas.DataFrame(
            data=dict(
                text=["cat", "dog", "fish", "orange",
                      "cat orange", "dog", "fish", "spider"],
                num=[1, 2, 3, 4, 5, 6, 7, 8]))

        tokey = ToKey() << ['text']
        data_idv = tokey.fit_transform(text_df)
        assert data_idv is not None
        assert len(data_idv) > 0
        assert str(sorted([str(dt) for dt in data_idv.dtypes])) == \
            "['category', 'int64']"
        fromkey = FromKey() << ['text']
        data = fromkey.fit_transform(data_idv)
        assert str(list(data_idv['text'])) == str(list(data['text']))
        t = numpy.unique(data_idv['text'].cat.codes)
        assert len(t) == 6
        assert list(data_idv['text'].cat.categories) == [
            "cat", "dog", "fish", "orange", "cat orange", "spider"]
Example #2
 def test_pipeline_clone_filedatastream_roles_shift_operator(self):
     pipe = Pipeline([
         ToKey() << {'group_2': 'group_2'},
         LightGbmRanker(number_of_iterations=1, number_of_leaves=4) << {
             Role.Feature: features,
             Role.Label: 'label_1',
             Role.GroupId: 'group_2'}
     ])
     fit_test_clone_and_check(pipe, fds)
Example #3
    def test_check_estimator_fromkey_categories(self):
        text_df = pandas.DataFrame(
            data=dict(
                text=["cat", "dog", "fish", "orange",
                      "cat orange", "dog", "fish", "spider"]),
            dtype="category")

        tokey = ToKey() << ['text']
        data_idv = tokey.fit_transform(text_df)
        assert data_idv is not None
        assert len(data_idv) > 0
        assert data_idv['text'].dtype == 'category'
Example #4
 def test_pipeline_clone_filedatastream_roles_arguments(self):
     pipe = Pipeline([
         ToKey() << {'group_2': 'group_2'},
         LightGbmRanker(feature=features,
                        label='label_1',
                        group_id='group_2',
                        number_of_iterations=1,
                        number_of_leaves=4)
     ])
     fit_test_clone_and_check(pipe, fds)
Example #5
    def test_lightgbmranker_asdataframe(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        df = pd.read_csv(file_path, encoding='utf-8')
        df['group'] = df['group'].astype(np.uint32)

        e = Pipeline([ToKey(columns={'rank': 'rank', 'group': 'group'}),
                      LightGbmRanker() << {
                          Role.Feature: ['Class', 'dep_day', 'duration'],
                          Role.Label: 'rank', Role.GroupId: 'group'}])

        e.fit(df)

        metrics, _ = e.test(df)
        assert_almost_equal(metrics['NDCG@1'][0], 0.43571429, decimal=7,
                            err_msg="NDCG@1 should be %s" % 0.43571429)
        assert_almost_equal(metrics['NDCG@2'][0], 0.5128226, decimal=7,
                            err_msg="NDCG@2 should be %s" % 0.5128226)
        assert_almost_equal(metrics['NDCG@3'][0], 0.55168069, decimal=7,
                            err_msg="NDCG@3 should be %s" % 0.55168069)
        assert_almost_equal(metrics['DCG@1'][0], 4.688759, decimal=3,
                            err_msg="DCG@1 should be %s" % 4.688759)
        assert_almost_equal(metrics['DCG@2'][0], 9.012395, decimal=3,
                            err_msg="DCG@2 should be %s" % 9.012395)
        assert_almost_equal(metrics['DCG@3'][0], 11.446943, decimal=3,
                            err_msg="DCG@3 should be %s" % 11.446943)
Example #6
    # setUpClass runs once per test class; it must be a classmethod
    # (here 'self' receives the test class itself).
    @classmethod
    def setUpClass(self):
        adult_path = get_dataset('uciadult_train').as_filepath()
        self.classification_data = FileDataStream.read_csv(adult_path)
        binary_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            LogisticRegressionBinaryClassifier(feature=['age', 'education'],
                                               label='label',
                                               number_of_threads=1)
        ])
        self.binary_model = binary_pipeline.fit(self.classification_data)
        self.binary_pfi = self.binary_model.permutation_feature_importance(
            self.classification_data)
        classifier_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            FastLinearClassifier(feature=['age', 'education'],
                                 label='label',
                                 number_of_threads=1,
                                 shuffle=False)
        ])
        self.classifier_model = classifier_pipeline.fit(
            self.classification_data)
        self.classifier_pfi = self.classifier_model.permutation_feature_importance(
            self.classification_data)

        infert_path = get_dataset('infert').as_filepath()
        self.regression_data = FileDataStream.read_csv(infert_path)
        regressor_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            FastLinearRegressor(feature=['induced', 'education'],
                                label='age',
                                number_of_threads=1,
                                shuffle=False)
        ])
        self.regressor_model = regressor_pipeline.fit(self.regression_data)
        self.regressor_pfi = self.regressor_model.permutation_feature_importance(
            self.regression_data)

        ticket_path = get_dataset('gen_tickettrain').as_filepath()
        self.ranking_data = FileDataStream.read_csv(ticket_path)
        ranker_pipeline = Pipeline([
            ToKey(columns=['group']),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group',
                           random_state=0,
                           number_of_threads=1)
        ])
        self.ranker_model = ranker_pipeline.fit(self.ranking_data)
        self.ranker_pfi = self.ranker_model.permutation_feature_importance(
            self.ranking_data)
Example #7
    def test_lightgbmranker_asfilestream(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        # Pure-nimbusml paradigm
        train_stream = FileDataStream.read_csv(file_path, encoding='utf-8')

        # pipeline
        pipeline = Pipeline([
            # the group_id column must be of key type
            ToKey(columns={
                'rank': 'rank',
                'group': 'group'
            }),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group')
        ])

        # train
        pipeline.fit(train_stream)

        # test
        eval_stream = FileDataStream.read_csv(file_path)
        metrics, _ = pipeline.test(eval_stream)
        assert_almost_equal(metrics['NDCG@1'][0],
                            43.571429,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 43.571429)
        assert_almost_equal(metrics['NDCG@2'][0],
                            51.28226,
                            decimal=5,
                            err_msg="NDCG@2 should be %s" % 51.28226)
        assert_almost_equal(metrics['NDCG@3'][0],
                            55.168069,
                            decimal=5,
                            err_msg="NDCG@3 should be %s" % 55.168069)
        assert_almost_equal(metrics['DCG@1'][0],
                            4.688759,
                            decimal=3,
                            err_msg="DCG@1 should be %s" % 4.688759)
        assert_almost_equal(metrics['DCG@2'][0],
                            9.012395,
                            decimal=3,
                            err_msg="DCG@2 should be %s" % 9.012395)
        assert_almost_equal(metrics['DCG@3'][0],
                            11.446943,
                            decimal=3,
                            err_msg="DCG@3 should be %s" % 11.446943)
Example #8
 def check_cv_with_defaults_df(
         self,
         label_name='rank',
         group_id='group',
         features=['price', 'Class', 'dep_day', 'nbr_stops', 'duration'],
         **params):
     steps = [
         ToKey() << {
             group_id: group_id
         },
         LightGbmRanker(min_data_per_leaf=1,
                        feature=features,
                        label=label_name,
                        group_id=group_id)
     ]
     data = self.data_pandas()
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #9
    def test_lightgbmranker_asdataframe_groupid(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        df = pd.read_csv(file_path, encoding='utf-8')
        df['group'] = df['group'].astype(np.uint32)

        e = Pipeline([
            ToKey(columns={
                'rank': 'rank',
                'group': 'group'
            }),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group')
        ])

        e.fit(df)

        metrics, _ = e.test(df)
        assert_almost_equal(metrics['NDCG@1'][0],
                            43.571429,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 43.571429)
        assert_almost_equal(metrics['NDCG@2'][0],
                            51.28226,
                            decimal=5,
                            err_msg="NDCG@2 should be %s" % 51.28226)
        assert_almost_equal(metrics['NDCG@3'][0],
                            55.168069,
                            decimal=5,
                            err_msg="NDCG@3 should be %s" % 55.168069)
        assert_almost_equal(metrics['DCG@1'][0],
                            4.688759,
                            decimal=3,
                            err_msg="DCG@1 should be %s" % 4.688759)
        assert_almost_equal(metrics['DCG@2'][0],
                            9.012395,
                            decimal=3,
                            err_msg="DCG@2 should be %s" % 9.012395)
        assert_almost_equal(metrics['DCG@3'][0],
                            11.446943,
                            decimal=3,
                            err_msg="DCG@3 should be %s" % 11.446943)
Example #10
 def check_cv_with_defaults2(self,
                             label_name='Label',
                             group_id='GroupId',
                             features='Features_1',
                             **params):
     # REVIEW: Switch ToKey() back to OneHotHashVectorizer() and reinstate the metrics
     # checks once issue https://github.com/dotnet/machinelearning/issues/1939 is resolved.
     params.pop('expected_metrics', None)
     steps = [
         ToKey() << {
             group_id: group_id
         },
         ColumnConcatenator() << {
             'Features': [features]
         },
         LightGbmRanker(min_data_per_leaf=1) << {
             Role.GroupId: group_id
         }
     ]
     data = self.data_wt_rename(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #11
    def test_fromkey_multiple_columns(self):
        df = pandas.DataFrame(data=dict(
            num1=[0, 1, 2, 3, 4, 5, 6],
            cat1=Categorical.from_codes([0, 2, 3, 1, 2, -1, 1],
                                        categories=["a", "b", "c", "d"]),
            cat2=Categorical.from_codes([2, 0, 1, 2, 0, 1, 1],
                                        categories=["e", "f", "g"]),
            num=[0, 1, 2, 3, 4, 5, 6],
            text1=["i", "j", "i", "j", "i", "j", "i"],
            text2=["k", "l", "l", "k", "k", "l", "k"]))

        concat = ColumnConcatenator() << {'textvec': ['text1', 'text2']}
        tokey = ToKey() << ['textvec']
        pipeline = Pipeline([concat, tokey])
        data_idv = pipeline.fit_transform(df)
        assert sorted(list(data_idv.columns)) == [
            'cat1', 'cat2', 'num', 'num1', 'text1', 'text2',
            'textvec.text1', 'textvec.text2']
        assert list(data_idv['cat1'].cat.categories) == ['a', 'b', 'c', 'd']
        assert list(data_idv['cat1'].cat.codes) == [0, 2, 3, 1, 2, -1, 1]
        assert list(data_idv['cat2'].cat.categories) == ['e', 'f', 'g']
        assert list(data_idv['cat2'].cat.codes) == [2, 0, 1, 2, 0, 1, 1]
        assert list(data_idv['textvec.text1'].cat.categories) == \
            ['i', 'k', 'j', 'l']
        assert list(data_idv['textvec.text1'].cat.codes) == \
            [0, 2, 0, 2, 0, 2, 0]
        assert list(data_idv['textvec.text2'].cat.categories) == \
            ['i', 'k', 'j', 'l']
        assert list(data_idv['textvec.text2'].cat.codes) == \
            [1, 3, 3, 1, 1, 3, 1]
Example #12
 def test_syntax12_group(self):
     # This test checks that a learner raises an exception
     # if a role is not allowed by the entrypoint.
     X = pandas.DataFrame(
         dict(education=['A', 'B', 'A', 'B', 'A'],
              workclass=['X', 'X', 'Y', 'Y', 'Y'],
              gr=[0, 0, 1, 1, 1],
              y=[1.1, 2.2, 1.24, 3.4, 3.4]))
     exp = Pipeline([
         OneHotVectorizer(columns=['workclass', 'education']),
         Concat(columns={'Feature': ['workclass', 'education']}),
         ToKey() << 'gr',
         FastTreesRegressor(
             number_of_trees=5, feature='Feature', group_id='gr') << {
                 Role.Label: 'y'
             }
     ])
     exp.fit(X, verbose=0)
     assert not hasattr(exp.nodes[-1], 'feature_')
     assert not hasattr(exp.nodes[-1], 'group_id_')
     assert exp.nodes[-1].feature_column_name_ == 'Feature'
     assert exp.nodes[-1].label_column_name_ == 'y'
     # assert not hasattr(exp.nodes[-1], 'row_group_column_name_')
     assert not hasattr(exp.nodes[-1], 'group_id_column')
     assert not hasattr(exp.nodes[-1], 'groupid_column_')
     assert not hasattr(exp.nodes[-1], 'groupid_column')
     if not hasattr(exp.nodes[-1], 'row_group_column_name_'):
         raise AssertionError("Attribute not found: {0}".format(", ".join(
             sorted(dir(exp.nodes[-1])))))
     assert exp.nodes[-1].row_group_column_name_ == 'gr'
     # Both y and weight are required here.
     # They are replaced by fake values.
     # The test does not fail, but the weight is not taken into account.
     X['y'] = -5
     X['weight'] = -5
     prediction = exp.predict(X)
     assert isinstance(prediction, pandas.DataFrame)
     assert list(prediction.columns) == ['Score']
     assert prediction.shape == (5, 1)
Example #13
 def check_cv_with_defaults(self,
                            label_name='Label',
                            group_id='GroupId',
                            features='Features_1',
                            **params):
     # REVIEW: Switch ToKey() back to OneHotHashVectorizer() and reinstate the metrics
     # checks once issue https://github.com/dotnet/machinelearning/issues/1939 is resolved.
     params.pop('expected_metrics', None)
     steps = [
         ToKey() << {
             group_id: group_id
         },
         # even when all the required roles are specified on the following
         # line, the roles are still not passed correctly
         LightGbmRanker(min_data_per_leaf=1) << {
             Role.GroupId: group_id,
             Role.Feature: features,
             Role.Label: label_name
         }
     ]
     data = self.data(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #14
 def check_cv_with_non_defaults(self,
                                label_name='label',
                                group_id='groupid',
                                features='Features_1',
                                **params):
     steps = [
         ToKey(columns={
             'groupid2': group_id,
             'label2': label_name
         }),
         LightGbmRanker() << {
             Role.GroupId: 'groupid2',
             Role.Label: 'label2',
             Role.Feature: [features]
         }
     ]
     data = self.data(label_name, group_id, features)
     cv = CV(steps)
     results = cv.fit(data, groups='groupid', cv=4)
     check_cv_results(cv._learner_type,
                      results,
                      n_folds=4,
                      expected_metrics={})
Example #15
    def test_get_fit_info_ranker(self):
        file_path = get_dataset("gen_tickettrain").as_filepath()
        file_schema = 'sep=, col=Label_1:R4:0 col=GroupId_2:TX:1 ' \
                      'col=Features_3:R4:3-5'
        train_stream = FileDataStream(file_path, schema=file_schema)
        pipeline = Pipeline([
            ToKey() << {
                'GroupId_2': 'GroupId_2'
            },
            ColumnConcatenator() << {
                'Features': ['Features_3']
            },
            LightGbmRanker() << {
                Role.Feature: 'Features',
                Role.Label: 'Label_1',
                Role.GroupId: 'GroupId_2'
            }
        ])

        info = pipeline.get_fit_info(train_stream)
        last = info[0][-1]
        inp = last['inputs']
        assert 'GroupId:GroupId_2' in inp
Example #16
###############################################################################
# FromKey

import pandas
from nimbusml.preprocessing import FromKey, ToKey
from pandas import Categorical

# Create the data
categorical_df = pandas.DataFrame(data=dict(
    key=Categorical.from_codes([0, 1, 2, 1, 2, 0], categories=['a', 'b', 'c']),
    text=['b', 'c', 'a', 'b', 'a', 'c']))

fromkey = FromKey(columns='key')
y = fromkey.fit_transform(categorical_df)
print(y)

tokey = ToKey(columns='text')
y = tokey.fit_transform(categorical_df)
y2 = fromkey.clone().fit_transform(y)
print(y2['text'] == categorical_df['text'])
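
Since ToKey outputs a pandas Categorical column (as the asserts in Example #1
show), the learned key mapping can be inspected directly. A small illustrative
addition, not part of the original sample:

print(y['text'].cat.categories)   # the distinct values ToKey indexed
print(list(y['text'].cat.codes))  # the per-row key codes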
Example #17
File: ToKey.py  Project: zyw400/NimbusML-1
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import ToKey

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32,
                               names={0: 'id'})
print(data.head())
#    age  case education   id  induced  parity  pooled.stratum  spontaneous ...
# 0  26.0   1.0    0-5yrs  1.0      1.0     6.0             3.0         2.0 ...
# 1  42.0   1.0    0-5yrs  2.0      1.0     1.0             1.0         0.0 ...
# 2  39.0   1.0    0-5yrs  3.0      2.0     6.0             4.0         0.0 ...
# 3  34.0   1.0    0-5yrs  4.0      2.0     4.0             2.0         0.0 ...
# 4  35.0   1.0   6-11yrs  5.0      1.0     3.0            32.0         1.0 ...

# transform usage
xf = ToKey(columns={'id_1': 'id', 'edu_1': 'education'})

# fit and transform
features = xf.fit_transform(data)
print(features.head())
#    age  case    edu_1 education   id  id_1  induced  parity  ...
# 0  26.0   1.0   0-5yrs    0-5yrs  1.0     0      1.0     6.0 ...
# 1  42.0   1.0   0-5yrs    0-5yrs  2.0     1      1.0     1.0 ...
# 2  39.0   1.0   0-5yrs    0-5yrs  3.0     2      2.0     6.0 ...
# 3  34.0   1.0   0-5yrs    0-5yrs  4.0     3      2.0     4.0 ...
# 4  35.0   1.0  6-11yrs   6-11yrs  5.0     4      1.0     3.0 ...
Example #18
###############################################################################
# FromKey
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import FromKey, ToKey

# data input (as a FileDataStream)
path = get_dataset('topics').as_filepath()

# load data
data = FileDataStream.read_csv(path, sep=',')

# transform usage
pipeline = Pipeline([
    ToKey(columns=['review_reverse']),
    FromKey(columns=['review_reverse'])
])

# fit and transform
output = pipeline.fit_transform(data)
print(output.head())
#   label                              review                   review_reverse
# 0      1  animals birds cats dogs fish horse   radiation galaxy universe duck
# 1      0    horse birds house fish duck cats  space galaxy universe radiation
# 2      1         car truck driver bus pickup                       bus pickup
# 3      0   car truck driver bus pickup horse                        car truck
# 4      1     car truck  car truck driver bus                     pickup horse
Example #19
     'Sepal_Width',
     'Petal_Length',
     'Petal_Width',
     'Setosa']}),
 'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
 'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
 'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
 'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
 'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                          label='Setosa'),
 'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                          label='Setosa'),
 'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
 'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
 'FromKey': Pipeline([
     ToKey(columns=['Sepal_Length']),
     FromKey(columns=['Sepal_Length'])
 ]),
 # GlobalContrastRowScaler currently requires a vector input to work
 'GlobalContrastRowScaler': Pipeline([
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}),
 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
 'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']),
 'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}),
Example #20
###############################################################################
# ToKey

import pandas
from nimbusml.preprocessing import ToKey

# Create the data
text_df = pandas.DataFrame(
    data=dict(
        text=["cat", "dog", "fish", "orange",
              "cat orange", "dog", "fish", "spider"]))

tokey = ToKey() << 'text'
y = tokey.fit_transform(text_df)
print(y)
# PFI for Ranking models
########################
# load input data
ticket_path = get_dataset('gen_tickettrain').as_filepath()
ranking_data = FileDataStream.read_csv(ticket_path)
print(ranking_data.head())
#    rank  group carrier  price  Class  dep_day  nbr_stops  duration
# 0     2      1      AA    240      3        1          0      12.0
# 1     1      1      AA    300      3        0          1      15.0
# 2     1      1      AA    360      3        0          2      18.0
# 3     0      1      AA    540      2        0          0      12.0
# 4     1      1      AA    600      2        0          1      15.0

# define the training pipeline with a ranker
ranking_pipeline = Pipeline([
    ToKey(columns=['group']),
    LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                   label='rank',
                   group_id='group')
])

# train the model
ranking_model = ranking_pipeline.fit(ranking_data)

# get permutation feature importance
ranking_pfi = ranking_model.permutation_feature_importance(ranking_data)

# Print PFI for each feature, ordered by most important features w.r.t. DCG@1.
# Since DCG is an increasing metric, the highest negative changes indicate the
# most important features.
print("===================== PFI for Ranking Model =====================")