def test_holidays(self):
        """DateTimeSplitter with country='Canada' should label holiday dates."""
        frame = pandas.DataFrame(
            data=dict(tokens1=[1, 2, 3, 157161600], tokens2=[10, 11, 12, 13]))

        # Every generated column except the basic date parts and the
        # holiday name, so the assertion below targets a small frame.
        unwanted = [
            'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', 'dtDayOfYear',
            'dtWeekOfMonth', 'dtQuarterOfYear', 'dtHalfOfYear', 'dtWeekIso',
            'dtYearIso', 'dtMonthLabel', 'dtAmPmLabel', 'dtDayOfWeekLabel',
            'dtIsPaidTimeOff'
        ]

        splitter = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
        transformed = Pipeline(
            [splitter,
             ColumnSelector(drop_columns=unwanted)]).fit_transform(frame)

        # 157161600 seconds after the Unix epoch falls on 1974-12-25.
        self.assertEqual(transformed.loc[3, 'dtHolidayName'], 'Christmas Day')
Example #2
0
# Example: split a Unix-timestamp column into date/time parts and look up
# the Canadian holiday name for each resulting date.
import pandas  # was missing: pandas.DataFrame / set_option below need it

from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
from nimbusml.preprocessing.schema import ColumnSelector

# tokens1 holds Unix timestamps (seconds); per the expected output below,
# 157161600 corresponds to 1974-12-25.
df = pandas.DataFrame(data=dict(
    tokens1=[1, 2, 3, 157161600],
    tokens2=[10, 11, 12, 13]
))

# Generated columns we don't want in the printed result; only the basic
# date parts and the holiday name are kept.
cols_to_drop = [
    'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
    'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
    'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
    'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
]

# country='Canada' selects the holiday calendar used for dtHolidayName.
dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'

pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
y = pipeline.fit_transform(df)

# view the three columns
pandas.set_option('display.max_columns', None)
pandas.set_option('display.width', 1000)
print(y)
#      tokens1  tokens2  dtYear  dtMonth  dtDay  dtHour  dtMinute  dtSecond  dtAmPm   dtHolidayName
# 0          1       10    1970        1      1       0         0         1       0  New Year's Day
# 1          2       11    1970        1      1       0         0         2       0  New Year's Day
# 2          3       12    1970        1      1       0         0         3       0  New Year's Day
# 3  157161600       13    1974       12     25       0         0         0       0   Christmas Day
Example #3
0
INSTANCES = {
    'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier(
        feature=['education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs']),
    'Binner': Binner(num_bins=3),
    'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    'ColumnConcatenator': ColumnConcatenator(columns={'Features': [
        'Sepal_Length',
        'Sepal_Width',
        'Petal_Length',
        'Petal_Width',
        'Setosa']}),
    'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
    'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
    'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
    'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
    'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
    'Filter': Filter(columns=[ 'Petal_Length', 'Petal_Width']),
    'FromKey': Pipeline([
        ToKey(columns=['Sepal_Length']),
        FromKey(columns=['Sepal_Length'])
    ]),
    # GlobalContrastRowScaler currently requires a vector input to work
    'GlobalContrastRowScaler': Pipeline([
        ColumnConcatenator() << {
            'concated_columns': [
                'Petal_Length',
Example #4
0
###############################################################################
# DateTimeSplitter
import pandas as pd
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import DateTimeSplitter

# Load the 'infert' sample dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',')

# Split the 'age' column into its date/time components (prefix 'dt_').
xf = DateTimeSplitter(prefix='dt_') << 'age'
features = xf.fit_transform(data)

# Keep only 'age' plus the generated dt_* columns for display.
features = features.drop(
    [
        'row_num', 'education', 'parity', 'induced', 'case', 'spontaneous',
        'stratum', 'pooled.stratum'
    ],
    axis=1)

# Widen pandas output so every generated column fits on one line.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(features.head())
#    age  dt_Year  dt_Month  dt_Day  dt_Hour  dt_Minute  dt_Second  dt_AmPm  dt_Hour12  dt_DayOfWeek  dt_DayOfQuarter  dt_DayOfYear  dt_WeekOfMonth  dt_QuarterOfYear  dt_HalfOfYear  dt_WeekIso  dt_YearIso dt_MonthLabel dt_AmPmLabel dt_DayOfWeekLabel dt_HolidayName  dt_IsPaidTimeOff
# 0   26     1970         1       1        0          0         26        0          0             4                1             0               0                 1              1           1        1970       January           am          Thursday           None                 0
# 1   42     1970         1       1        0          0         42        0          0             4                1             0               0                 1              1           1        1970       January           am          Thursday           None                 0
Example #5
0
    'check_estimators_pickle')

# Check skipped for every estimator.
# NOTE(review): these strings look like sklearn estimator-check names
# (sklearn.utils.estimator_checks) — confirm against the test harness
# that consumes these constants.
OMITTED_CHECKS_ALWAYS = 'check_estimators_nan_inf'

# Checks skipped for estimators that do not support binary data —
# presumably filtered out by the harness; verify against its reader.
NOBINARY_CHECKS = [
    'check_estimator_sparse_data', 'check_dtype_object',
    'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature',
    'check_dont_overwrite_parameters', 'check_supervised_y_2d',
    'check_estimators_fit_returns_self', 'check_estimators_overwrite_params',
    'check_estimators_dtypes', 'check_classifiers_classes',
    'check_classifiers_train'
]

INSTANCES = {
    'DateTimeSplitter':
    DateTimeSplitter(prefix='dt', columns=['F0']),
    'EnsembleClassifier':
    EnsembleClassifier(num_models=3),
    'EnsembleRegressor':
    EnsembleRegressor(num_models=3),
    'FactorizationMachineBinaryClassifier':
    FactorizationMachineBinaryClassifier(shuffle=False),
    'KMeansPlusPlus':
    KMeansPlusPlus(n_clusters=2),
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(minimum_example_count_per_group=1,
                             minimum_example_count_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(minimum_example_count_per_group=1,
                       minimum_example_count_per_leaf=1),
    'LightGbmRegressor':
 def test_check_estimator_DateTimeSplitter(self):
     """Splitting small Unix timestamps should report the epoch year 1970."""
     frame = pandas.DataFrame(data=dict(dt=list(range(8))))
     splitter = DateTimeSplitter(prefix='dt_') << 'dt'
     transformed = splitter.fit_transform(frame)
     # Timestamps 0..7 all fall on 1970-01-01.
     assert_equal(transformed['dt_Year'][0], 1970,
                  "it should have been year of 1970")