def test_holidays(self):
    """DateTimeSplitter with country='Canada' should name the holiday column.

    Row 3 holds 157161600, which the splitter resolves to 1974-12-25
    (see the DateTimeSplitter example output), so its dtHolidayName
    must be 'Christmas Day'.
    """
    frame = pandas.DataFrame({
        'tokens1': [1, 2, 3, 157161600],
        'tokens2': [10, 11, 12, 13],
    })
    # Generated calendar columns that this check does not inspect.
    unwanted = [
        'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
        'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
        'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
        'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
    ]
    splitter = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
    pipe = Pipeline([splitter, ColumnSelector(drop_columns=unwanted)])
    transformed = pipe.fit_transform(frame)
    self.assertEqual(transformed.loc[3, 'dtHolidayName'], 'Christmas Day')
# Example: expand a Unix-timestamp column into calendar parts with
# DateTimeSplitter, keeping only the date/time basics plus the holiday name.
import pandas  # fix: pandas was used below but never imported in this snippet

from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
from nimbusml.preprocessing.schema import ColumnSelector

df = pandas.DataFrame(data=dict(
    tokens1=[1, 2, 3, 157161600],
    tokens2=[10, 11, 12, 13]
))

# DateTimeSplitter output columns that this example does not display.
cols_to_drop = [
    'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
    'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
    'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
    'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
]

# country='Canada' enables holiday-name lookup for that country's calendar.
dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'

pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
y = pipeline.fit_transform(df)

# view the remaining columns without pandas truncating the output
pandas.set_option('display.max_columns', None)
pandas.set_option('display.width', 1000)
print(y)

#    tokens1  tokens2  dtYear  dtMonth  dtDay  dtHour  dtMinute  dtSecond  dtAmPm  dtHolidayName
# 0        1       10    1970        1      1       0         0         1       0  New Year's Day
# 1        2       11    1970        1      1       0         0         2       0  New Year's Day
# 2        3       12    1970        1      1       0         0         3       0  New Year's Day
# 3  157161600     13    1974       12     25       0         0         0       0  Christmas Day
INSTANCES = { 'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier( feature=['education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs']), 'Binner': Binner(num_bins=3), 'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}), 'ColumnConcatenator': ColumnConcatenator(columns={'Features': [ 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa']}), 'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']), 'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}), 'CountSelector': CountSelector(count=5, columns=['Sepal_Width']), 'DateTimeSplitter': DateTimeSplitter(prefix='dt'), 'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'), 'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'), 'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'), 'Filter': Filter(columns=[ 'Petal_Length', 'Petal_Width']), 'FromKey': Pipeline([ ToKey(columns=['Sepal_Length']), FromKey(columns=['Sepal_Length']) ]), # GlobalContrastRowScaler currently requires a vector input to work 'GlobalContrastRowScaler': Pipeline([ ColumnConcatenator() << { 'concated_columns': [ 'Petal_Length',
###############################################################################
# DateTimeSplitter
import pandas as pd
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import DateTimeSplitter

# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',')

# Split the integer 'age' column into date/time parts (prefix 'dt_').
xf = DateTimeSplitter(prefix='dt_') << 'age'

features = xf.fit_transform(data)

# Original dataset columns we don't want to show alongside the new ones.
passthrough_cols = [
    'row_num', 'education', 'parity', 'induced',
    'case', 'spontaneous', 'stratum', 'pooled.stratum'
]
features = features.drop(passthrough_cols, axis=1)

# Widen pandas display so every generated column is visible.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
print(features.head())

#    age  dt_Year  dt_Month  dt_Day  dt_Hour  dt_Minute  dt_Second  dt_AmPm  dt_Hour12  dt_DayOfWeek  dt_DayOfQuarter  dt_DayOfYear  dt_WeekOfMonth  dt_QuarterOfYear  dt_HalfOfYear  dt_WeekIso  dt_YearIso dt_MonthLabel dt_AmPmLabel dt_DayOfWeekLabel dt_HolidayName  dt_IsPaidTimeOff
# 0   26     1970         1       1        0          0         26        0          0             4                1             0               0                 1              1           1        1970       January           am          Thursday           None                 0
# 1   42     1970         1       1        0          0         42        0          0             4                1             0               0                 1              1           1        1970       January           am          Thursday           None                 0
'check_estimators_pickle') OMITTED_CHECKS_ALWAYS = 'check_estimators_nan_inf' NOBINARY_CHECKS = [ 'check_estimator_sparse_data', 'check_dtype_object', 'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature', 'check_dont_overwrite_parameters', 'check_supervised_y_2d', 'check_estimators_fit_returns_self', 'check_estimators_overwrite_params', 'check_estimators_dtypes', 'check_classifiers_classes', 'check_classifiers_train' ] INSTANCES = { 'DateTimeSplitter': DateTimeSplitter(prefix='dt', columns=['F0']), 'EnsembleClassifier': EnsembleClassifier(num_models=3), 'EnsembleRegressor': EnsembleRegressor(num_models=3), 'FactorizationMachineBinaryClassifier': FactorizationMachineBinaryClassifier(shuffle=False), 'KMeansPlusPlus': KMeansPlusPlus(n_clusters=2), 'LightGbmBinaryClassifier': LightGbmBinaryClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRegressor':
def test_check_estimator_DateTimeSplitter(self):
    """Smoke-test DateTimeSplitter on small integer input.

    Inputs 0..7 all map to a dt_Year of 1970 (consistent with the
    Unix-epoch interpretation shown in the DateTimeSplitter examples).
    """
    # fix: identity comprehension [i for i in range(8)] replaced with
    # the idiomatic list(range(8)) (same values, same order).
    df = pandas.DataFrame(data=dict(dt=list(range(8))))
    dt = DateTimeSplitter(prefix='dt_') << 'dt'
    result = dt.fit_transform(df)
    assert_equal(
        result['dt_Year'][0], 1970,
        "it should have been year of 1970")