# excerpt from a unittest.TestCase; the imports below are what the method relies on
import pandas

from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
from nimbusml.preprocessing.schema import ColumnSelector


def test_holidays(self):
    df = pandas.DataFrame(
        data=dict(tokens1=[1, 2, 3, 157161600], tokens2=[10, 11, 12, 13]))

    cols_to_drop = [
        'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', 'dtDayOfYear',
        'dtWeekOfMonth', 'dtQuarterOfYear', 'dtHalfOfYear', 'dtWeekIso',
        'dtYearIso', 'dtMonthLabel', 'dtAmPmLabel', 'dtDayOfWeekLabel',
        'dtIsPaidTimeOff'
    ]

    # split the 'tokens1' Unix timestamps into date/time parts,
    # including holiday names for the given country
    dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
    pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
    y = pipeline.fit_transform(df)

    self.assertEqual(y.loc[3, 'dtHolidayName'], 'Christmas Day')
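# The magic number 157161600 above is a Unix timestamp (seconds since
# 1970-01-01 UTC). A quick standard-library check, independent of nimbusml,
# shows why the assertion expects 'Christmas Day':
from datetime import datetime, timezone

# 157161600 seconds after the epoch is exactly 1819 days, i.e. 1974-12-25
print(datetime.fromtimestamp(157161600, tz=timezone.utc))
# 1974-12-25 00:00:00+00:00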
Example #2
import pandas

from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
from nimbusml.preprocessing.schema import ColumnSelector

df = pandas.DataFrame(data=dict(
    tokens1=[1, 2, 3, 157161600],
    tokens2=[10, 11, 12, 13]
))

cols_to_drop = [
    'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
    'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
    'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
    'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
]

# split the 'tokens1' Unix timestamps into date/time parts,
# including holiday names for Canada
dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'

pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
y = pipeline.fit_transform(df)

# widen the pandas display so all generated columns are visible
pandas.set_option('display.max_columns', None)
pandas.set_option('display.width', 1000)
print(y)
#      tokens1  tokens2  dtYear  dtMonth  dtDay  dtHour  dtMinute  dtSecond  dtAmPm   dtHolidayName
# 0          1       10    1970        1      1       0         0         1       0  New Year's Day
# 1          2       11    1970        1      1       0         0         2       0  New Year's Day
# 2          3       12    1970        1      1       0         0         3       0  New Year's Day
# 3  157161600       13    1974       12     25       0         0         0       0   Christmas Day
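# Dropping unwanted columns is one option; ColumnSelector also accepts a
# keep-list through its 'columns' parameter (used in the later examples),
# so an equivalent pipeline can whitelist just the columns of interest.
# A minimal sketch, reusing df from above:
keep = ['tokens1', 'tokens2', 'dtYear', 'dtMonth', 'dtDay', 'dtHour',
        'dtMinute', 'dtSecond', 'dtAmPm', 'dtHolidayName']
dts2 = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
pipeline2 = Pipeline([dts2, ColumnSelector(columns=keep)])
print(pipeline2.fit_transform(df))  # same columns as the output above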
Example #3

# imports required by the INSTANCES mapping below
from nimbusml import Pipeline
from nimbusml.ensemble import (FastForestBinaryClassifier,
                               FastTreesTweedieRegressor)
from nimbusml.feature_selection import CountSelector
from nimbusml.linear_model import (AveragedPerceptronBinaryClassifier,
                                   FastLinearBinaryClassifier)
from nimbusml.preprocessing import DateTimeSplitter, FromKey, ToKey
from nimbusml.preprocessing.missing_values import Filter
from nimbusml.preprocessing.normalization import (Binner,
                                                  GlobalContrastRowScaler)
from nimbusml.preprocessing.schema import (ColumnConcatenator,
                                           ColumnDuplicator, ColumnSelector)
from nimbusml.preprocessing.text import CharTokenizer

# named, pre-configured estimator instances keyed by class name
INSTANCES = {
    'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier(
        feature=['education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs']),
    'Binner': Binner(num_bins=3),
    'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    'ColumnConcatenator': ColumnConcatenator(columns={'Features': [
        'Sepal_Length',
        'Sepal_Width',
        'Petal_Length',
        'Petal_Width',
        'Setosa']}),
    'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
    'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
    'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
    'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
    'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
    'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
    'FromKey': Pipeline([
        ToKey(columns=['Sepal_Length']),
        FromKey(columns=['Sepal_Length'])
    ]),
    # GlobalContrastRowScaler currently requires a vector input to work
    'GlobalContrastRowScaler': Pipeline([
        # a plausible completion of the truncated entry: concatenate the
        # numeric columns into the vector input the scaler requires
        ColumnConcatenator() << {
            'concated_columns': [
                'Petal_Length', 'Sepal_Width', 'Sepal_Length']},
        GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
    ]),
    # ... (remaining entries omitted in this excerpt)
}
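# A mapping like this is typically consumed by a test driver that looks up
# a pre-configured estimator by name. A minimal sketch of that pattern,
# using a tiny hand-made frame (illustrative only):
import pandas as pd

iris_like = pd.DataFrame({
    'Sepal_Length': [5.1, 4.9], 'Sepal_Width': [3.5, 3.0],
    'Petal_Length': [1.4, 1.4], 'Petal_Width': [0.2, 0.2],
    'Setosa': [1.0, 1.0]})

xf = INSTANCES['ColumnSelector']    # fetch a configured transform by name
print(xf.fit_transform(iris_like))  # keeps Sepal_Width and Sepal_Length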
Example #4
###############################################################################
# ColumnSelector
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnSelector

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path, sep=',')

# transform usage
xf = ColumnSelector(columns=['education', 'age'])

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#   age education
# 0   26    0-5yrs
# 1   42    0-5yrs
# 2   39    0-5yrs
# 3   34    0-5yrs
# 4   35   6-11yrs
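# The same selection can also be written with the << operator, which
# nimbusml transforms accept as shorthand for their input columns
# (a sketch, reusing the data stream from above):
xf2 = ColumnSelector() << ['education', 'age']
print(xf2.fit_transform(data).head())  # same two columns as before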
Example #5
# imports used below; 'path' and 'file_schema' are defined in the omitted
# preamble of the original example
from nimbusml import FileDataStream, Pipeline
from nimbusml.feature_extraction.text import WordEmbedding
from nimbusml.preprocessing import FromKey
from nimbusml.preprocessing.schema import ColumnSelector
from nimbusml.preprocessing.text import CharTokenizer

data = FileDataStream(path, schema=file_schema)
print(data.head())

#    Sentiment                                      SentimentText
# 0        1.0  ==RUDE== Dude, you are rude upload that carl p...
# 1        1.0  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2        1.0  Stop trolling, zapatancas, calling me a liar m...
# 3        1.0  ==You're cool==  You seem like a really cool g...
# 4        1.0  ::::: Why are you threatening me? I'm not bein...

# CharTokenizer converts the text into a vector of characters of Key type.
# Use FromKey to map the keys back to characters first, then feed the
# result into WordEmbedding.

pipe = Pipeline([
    CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),
    WordEmbedding(model_kind='GloVe50D',
                  columns={'Feature': 'SentimentText_FromKey'}),
    ColumnSelector(columns=['Sentiment', 'SentimentText', 'Feature'])
])

print(pipe.fit_transform(data).head())

#    Sentiment  ... Feature.149
# 0        1.0  ...     2.67440
# 1        1.0  ...     0.78858
# 2        1.0  ...     2.67440
# 3        1.0  ...     2.67440
# 4        1.0  ...     2.67440

# [5 rows x 152 columns]
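# To see the Key-type round trip described above, the first two stages can
# be run on their own; a minimal sketch, assuming the same data stream:
probe = Pipeline([
    CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'})
])
print(probe.fit_transform(data).head())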
Example #6

import numpy as np
from sklearn.model_selection import train_test_split

from nimbusml import Pipeline, Role
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LogisticRegressionClassifier
from nimbusml.preprocessing.schema import (ColumnConcatenator, ColumnDropper,
                                           ColumnSelector)

np.random.seed(0)
df = get_dataset("iris").as_df()
print(df.head(2))
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Label Species  Setosa
# 0           5.1          3.5           1.4          0.2      0  setosa     1.0
# 1           4.9          3.0           1.4          0.2      0  setosa     1.0

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

mycols = [
    'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa'
]

# drop the 'Species' column using the ColumnDropper transform, then
# gather mycols into the Feature role using the ColumnConcatenator transform
dropcols = ColumnDropper() << 'Species'
concat = ColumnConcatenator() << {Role.Feature: mycols}

pipeline = Pipeline([dropcols, concat, LogisticRegressionClassifier()])
pipeline.fit(X_train, y_train)
scores1 = pipeline.predict(X_test)

# select mycols using the ColumnSelector transform
select = ColumnSelector() << mycols
pipeline2 = Pipeline([select, LogisticRegressionClassifier()])
pipeline2.fit(X_train, y_train)
scores2 = pipeline2.predict(X_test)

# verify that both experiments produce identical predictions
print(scores1.head())
print(scores2.head())
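# Printing the heads only eyeballs the claim; a programmatic check makes it
# explicit. A small sketch, assuming nimbusml's usual classifier output with
# a 'PredictedLabel' column:
assert (scores1['PredictedLabel'] == scores2['PredictedLabel']).all(), \
    'the two pipelines disagree'
print('predictions match')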