Example #1
    def test_ensemble_supports_cv_with_user_defined_transforms(self):
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
        handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
        lgbm_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ols_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ogd_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'shuffle': False,
            'normalize': 'Yes'
        }

        for split_start in ['before_transforms', 'after_transforms']:
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                LightGbmRegressor(**lgbm_args)
            ]

            cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
            l2_avg_lgbm = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

            r1 = OrdinaryLeastSquaresRegressor(**ols_args)
            r2 = OnlineGradientDescentRegressor(**ogd_args)
            r3 = LightGbmRegressor(**lgbm_args)

            data = FileDataStream(path, schema)
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
            ]

            cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
            l2_avg_ensemble = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

            self.assertTrue(l2_avg_ensemble < l2_avg_lgbm)
Example #2
    def test_datetime_column_parsed_from_string(self):
        dates = ["2018-01-02", "2018-02-01"]
        df = pd.DataFrame({'c1': dates, 'c2': [3, 4]})

        file_name = get_temp_file('.csv')
        df.to_csv(file_name)
        df = pd.read_csv(file_name, parse_dates=['c1'], index_col=0)

        self.assertEqual(df.dtypes[0], np.dtype('datetime64[ns]'))

        pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
        result = pipeline.fit_transform(df)

        self.assertEqual(result.loc[0, 'c1'].year, 2018)
        self.assertEqual(result.loc[0, 'c1'].month, 1)
        self.assertEqual(result.loc[0, 'c1'].day, 2)
        self.assertEqual(result.loc[0, 'c1'].hour, 0)
        self.assertEqual(result.loc[0, 'c1'].minute, 0)
        self.assertEqual(result.loc[0, 'c1'].second, 0)

        self.assertEqual(result.loc[1, 'c1'].year, 2018)
        self.assertEqual(result.loc[1, 'c1'].month, 2)
        self.assertEqual(result.loc[1, 'c1'].day, 1)
        self.assertEqual(result.loc[1, 'c1'].hour, 0)
        self.assertEqual(result.loc[1, 'c1'].minute, 0)
        self.assertEqual(result.loc[1, 'c1'].second, 0)

        self.assertEqual(len(result), 2)
        self.assertEqual(result.dtypes[0], np.dtype('datetime64[ns]'))

        os.remove(file_name)
Example #3
    def test_input_types(self):
        df = DataFrame(data=dict(Label=[1, 2, 3, 4, 5],
                                 f=[1.1, 2.2, 3.3, np.nan, 5.5],
                                 f1=[2.2, np.nan, 4.4, 5.5, 6.6]))
        h = Handler(replace_with='Mean')
        ft = FastLinearRegressor(shuffle=False, number_of_threads=1)
        p = Pipeline([h, ft])
        p.fit(df[['f', 'f1']].values, df['Label'])
        res = p.predict(df[['f', 'f1']].values)
        print(res)
        print(p.summary())
        assert_allclose(res['Score'].values,
                        [4.965541, 0.519701, 4.992831, 3.877400, 5.020121],
                        rtol=1e-4)
Example #4
    def test_input_conversion_to_float_retains_other_column_types(self):
        data = {
            'f0': [0, 1, 2, 3],
            'f1': ['2', '3', '4', '5'],
            'f2': [4, 5, np.nan, 9]
        }

        data = DataFrame(data).astype({
            'f0': np.int32,
            'f1': str,
            'f2': np.float64
        })

        # Check Indicator
        xf = Indicator(columns={'f2.ind': 'f2'})
        result = xf.fit_transform(data)
        assert_equal(result.dtypes['f0'], np.int32)
        assert_equal(result.dtypes['f1'], np.object)
        assert_equal(result.dtypes['f2'], np.float64)
        assert_equal(result.dtypes['f2.ind'], np.bool)
        assert_equal(result.loc[2, 'f2.ind'], True)
        assert_equal(len(result), 4)

        # Check Filter
        xf = Filter(columns=['f2'])
        result = xf.fit_transform(data)
        assert_equal(len(result), 3)
        assert_equal(result.loc[2, 'f2'], 9.0)
        assert_equal(result.dtypes['f0'], np.int32)
        assert_equal(result.dtypes['f1'], np.object)
        assert_equal(result.dtypes['f2'], np.float32)

        xf = Filter(columns=['f1'])
        result = xf.fit_transform(data)
        assert_equal(len(result), 4)
        assert_equal(result.loc[3, 'f2'], 9.0)
        assert_equal(result.dtypes['f0'], np.int32)
        assert_equal(result.dtypes['f1'], np.float32)
        assert_equal(result.dtypes['f2'], np.float64)

        # Check Handler
        xf = Handler(columns=['f2'], replace_with='Mean')
        result = xf.fit_transform(data)
        assert_equal(len(result), 4)
        assert_equal(result.loc[2, 'f2.f2'], 6.0)
        assert_equal(result.dtypes['f0'], np.int32)
        assert_equal(result.dtypes['f1'], np.object)
        assert_equal(result.dtypes['f2.f2'], np.float32)
Example #5
    def test_dprep_datastream(self):
        import azureml.dataprep as dprep

        dates = ["2018-01-02 00:00:00", "2018-02-01 10:00:00"]
        col2 = ['0', '1']
        label_array = np.repeat([0], 2)
        train_df = pd.DataFrame({
            'col1': dates,
            'col2': col2,
            'label': label_array
        })

        pipeline = Pipeline(steps=[
            Handler(columns={'2': 'col2'},
                    concat=False,
                    impute_by_slot=True,
                    replace_with='Mean')
        ])

        file_name = get_temp_file('.csv')
        train_df.to_csv(file_name)

        dataflow = dprep.read_csv(file_name, infer_column_types=True)
        dprepDataStream = DprepDataStream(dataflow)

        result = pipeline.fit_transform(dprepDataStream)

        self.assertEqual(result.loc[:, 'col1'].dtype,
                         np.dtype('datetime64[ns]'))

        self.assertEqual(result.loc[0, 'col1'].year, 2018)
        self.assertEqual(result.loc[0, 'col1'].month, 1)
        self.assertEqual(result.loc[0, 'col1'].day, 2)
        self.assertEqual(result.loc[0, 'col1'].hour, 0)
        self.assertEqual(result.loc[0, 'col1'].minute, 0)
        self.assertEqual(result.loc[0, 'col1'].second, 0)

        self.assertEqual(result.loc[1, 'col1'].year, 2018)
        self.assertEqual(result.loc[1, 'col1'].month, 2)
        self.assertEqual(result.loc[1, 'col1'].day, 1)
        self.assertEqual(result.loc[1, 'col1'].hour, 10)
        self.assertEqual(result.loc[1, 'col1'].minute, 0)
        self.assertEqual(result.loc[1, 'col1'].second, 0)

        os.remove(file_name)
Example #6
    def test_negative_values(self):
        milliseconds_in_year = 365 * 24 * 60 * 60 * 1000
        data = [i * milliseconds_in_year for i in [-1, -2, -3, -3.3]]

        df = pd.DataFrame({'c1': data, 'c2': [3, 4, 5, 6]})
        df = df.astype({'c1': np.dtype('datetime64[ms]')})

        pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
        result = pipeline.fit_transform(df)

        self.assertTrue(result.loc[:, 'c1'].equals(df.loc[:, 'c1']))
        self.assertEqual(result.loc[:, 'c1'].dtype, np.dtype('datetime64[ns]'))

        self.assertEqual(result.loc[0, 'c1'].year, 1969)
        self.assertEqual(result.loc[0, 'c1'].hour, 0)
        self.assertEqual(result.loc[0, 'c1'].minute, 0)
        self.assertEqual(result.loc[0, 'c1'].second, 0)

        self.assertEqual(result.loc[3, 'c1'].year, 1966)
Example #7
    def test_input_conversion_to_float(self):
        data = {
            'f0': [0, 1, 2, 3],
            'f1': [1, 2, 3, 4],
            'f2': [1, 2, 3, 4],
            'f3': [1, 2, 3, 4],
            'f4': ['2', '3', '4', '5'],
            'f5': [4, 5, np.nan, 9]
        }

        data = DataFrame(data).astype({
            'f0': np.int8,
            'f1': np.int16,
            'f2': np.int32,
            'f3': np.int64,
            'f4': str,
            'f5': np.float64
        })

        # Check Indicator
        xf = Indicator()
        result = xf.fit_transform(data)

        assert_equal(result.loc[2, 'f5'], True)
        result.loc[2, 'f5'] = False
        result = ~result
        for val in result.all().tolist():
            self.assertTrue(val)

        # Check Filter
        xf = Filter()
        result = xf.fit_transform(data)
        assert_equal(len(result), 3)
        assert_equal(result.loc[2, 'f5'], 9.0)

        # Check Handler
        xf = Handler(replace_with='Mean')
        result = xf.fit_transform(data)
        assert_equal(len(result), 4)
        assert_equal(result.loc[2, 'f5.f5'], 6.0)
        assert_equal(result.loc[2, 'f5.IsMissing.f5'], 1.0)
Example #8
    def test_timestamp_boundaries(self):
        # Here are the current min and max for a Pandas Timestamp
        # 1677-09-21 00:12:43.145225
        # 2262-04-11 23:47:16.854775807

        data = [pd.Timestamp(1677, 9, 22, 1), pd.Timestamp.max]
        df = pd.DataFrame({'c1': data, 'c2': [3, 4]})
        df = df.astype({'c1': np.dtype('datetime64[ms]')})

        pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
        result = pipeline.fit_transform(df)

        self.assertTrue(result.loc[:, 'c1'].equals(df.loc[:, 'c1']))
        self.assertEqual(result.dtypes[0], np.dtype('datetime64[ns]'))

        self.assertEqual(result.loc[0, 'c1'].year, 1677)
        self.assertEqual(result.loc[0, 'c1'].month, 9)
        self.assertEqual(result.loc[0, 'c1'].day, 22)

        self.assertEqual(result.loc[1, 'c1'].year, 2262)
        self.assertEqual(result.loc[1, 'c1'].month, 4)
        self.assertEqual(result.loc[1, 'c1'].day, 11)
Example #9
    def test_split_start_with_transforms_with_presteps(self):
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        pipeline_steps = [
            Indicator() << {
                'Ozone_ind': 'Ozone',
                'Solar_R_ind': 'Solar_R'
            },
            Handler(replace_with='Mean') << {
                'Solar_R': 'Solar_R',
                'Ozone': 'Ozone'
            },
            LightGbmRegressor(feature=[
                'Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'
            ],
                              label='Wind')
        ]

        results = CV(pipeline_steps).fit(data,
                                         split_start='after_transforms',
                                         dry_run=True)
        results = json.loads(results)

        node_names = [ep['Name'] for ep in results['nodes']]
        cv_node = [
            ep for ep in results['nodes']
            if 'Models.CrossValidator' in ep['Name']
        ][0]
        cv_sub_node_names = [ep['Name'] for ep in cv_node['Inputs']['Nodes']]

        self.assertTrue('Transforms.MissingValueHandler' in node_names)
        self.assertTrue(
            'Transforms.MissingValueHandler' not in cv_sub_node_names)
        self.assertTrue('Transforms.ModelCombiner' in node_names)
Example #10
    def test_performance_syntax(self):
        train_file = get_dataset('uciadult_train').as_filepath()
        test_file = get_dataset('uciadult_test').as_filepath()
        file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 ' \
                      'col=workclass:TX:1 col=education:TX:2 ' \
                      'col=marital-status:TX:3 col=occupation:TX:4 ' \
                      'col=relationship:TX:5 col=ethnicity:TX:6 ' \
                      'col=sex:TX:7 col=native-country-region:TX:8 header+'
        categorical_columns = [
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'ethnicity', 'sex', 'native-country-region'
        ]
        label_column = 'label'
        na_columns = ['Features']
        feature_columns_idv = na_columns + categorical_columns

        exp = Pipeline([
            OneHotHashVectorizer(columns=categorical_columns),
            Handler(columns=na_columns),
            FastLinearBinaryClassifier(feature=feature_columns_idv,
                                       label=label_column)
        ])

        train_data = FileDataStream(train_file, schema=file_schema)
        exp.fit(train_data, label_column, verbose=0)
        print("train time %s" % exp._run_time)

        test_data = FileDataStream(test_file, schema=file_schema)
        out_data = exp.predict(test_data)
        print("predict time %s" % exp._run_time)

        (test, label_test) = get_X_y(test_file, label_column, sep=',')
        (acc1, auc1) = evaluate_binary_classifier(
            label_test.iloc[:, 0].values,
            out_data.loc[:, 'PredictedLabel'].values,
            out_data.loc[:, 'Probability'].values)

        print('ACC %s, AUC %s' % (acc1, auc1))

        exp = Pipeline([
            OneHotHashVectorizer() << categorical_columns,
            Handler() << na_columns,
            FastLinearBinaryClassifier() << feature_columns_idv
        ])

        train_data = FileDataStream(train_file, schema=file_schema)
        exp.fit(train_data, label_column, verbose=0)
        print("train time %s" % exp._run_time)

        test_data = FileDataStream(test_file, schema=file_schema)
        out_data = exp.predict(test_data)
        print("predict time %s" % exp._run_time)

        (test, label_test) = get_X_y(test_file, label_column, sep=',')
        (acc2, auc2) = evaluate_binary_classifier(
            label_test.iloc[:, 0].values,
            out_data.loc[:, 'PredictedLabel'].values,
            out_data.loc[:, 'Probability'].values)
        print('ACC %s, AUC %s' % (acc2, auc2))
        assert abs(acc1 - acc2) < 0.02
        assert abs(auc1 - auc2) < 0.02
Example #11
 'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
 'Filter': Filter(columns=[ 'Petal_Length', 'Petal_Width']),
 'FromKey': Pipeline([
     ToKey(columns=['Sepal_Length']),
     FromKey(columns=['Sepal_Length'])
 ]),
 # GlobalContrastRowScaler currently requires a vector input to work
 'GlobalContrastRowScaler': Pipeline([
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}),
 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
 'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']),
 'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}),
 'KMeansPlusPlus': KMeansPlusPlus(n_clusters=3, feature=['Sepal_Width', 'Sepal_Length']),
 'LightGbmRanker': LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                                  label='rank',
                                  group_id='group'),
 'Loader': Loader(columns={'ImgPath': 'Path'}),
 'LpScaler': Pipeline([
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     LpScaler(columns={'normed_columns': 'concated_columns'})
Example #12
File: CV.py  Project: zyw400/NimbusML-1
cm = cv_results['confusion_matrix']
print(cm[cm.Fold == 1])

# Case 2: Using CV with split_start option

path = get_dataset("airquality").as_filepath()
schema = DataSchema.read_schema(path)
data = FileDataStream(path, schema)

# CV also accepts the list of pipeline steps directly as input
pipeline_steps = [
    Indicator() << {
        'Ozone_ind': 'Ozone',
        'Solar_R_ind': 'Solar_R'
    },
    Handler(replace_with='Mean') << {
        'Solar_R': 'Solar_R',
        'Ozone': 'Ozone'
    },
    FastLinearRegressor(
        feature=['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        label='Wind')
]

# Since the Indicator and Handler transforms don't learn from data,
# they could be run once before splitting the data into folds, instead of
# repeating them once per fold. We use the 'split_start=after_transforms'
# option to achieve this optimization.
cv_results = CV(pipeline_steps).fit(data, split_start='after_transforms')

# Results can be accessed the same way as in Case 1 above.
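
# A minimal sketch of reading those results (the 'metrics_summary' key and
# the 'L2(avg)' column are the same ones used in the ensemble test above):
metrics_summary = cv_results['metrics_summary']
print(metrics_summary)

# Fold-averaged L2 loss, taken from the 'Average' row as in the tests above.
l2_avg = metrics_summary.loc['Average', 'L2(avg)']
print('Average L2:', l2_avg)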
Example #13
import numpy as np
import pandas as pd

from nimbusml import FileDataStream
from nimbusml.preprocessing.missing_values import Handler

with_nans = pd.DataFrame(
    data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
              Sepal_Width=[.75, .9, .8, .76],
              Petal_Length=[np.nan, 2.5, 2.6, 2.4],
              Petal_Width=[.8, .7, .9, 0.7],
              Species=["setosa", "viginica", "", 'versicolor']))

# write NaNs to file to show how this transform works
tmpfile = 'tmpfile_with_nans.csv'
with_nans.to_csv(tmpfile, index=False)

data = FileDataStream.read_csv(tmpfile, sep=',', numeric_dtype=np.float32)

# transform usage
xf = Handler(columns={'PL': 'Petal_Length'})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())

#   PL.IsMissing.Petal_Length  PL.Petal_Length  Petal_Length  Petal_Width  ...
# 0                        1.0              0.0           NaN          0.8  ...
# 1                        0.0              2.5           2.5          0.7  ...
# 2                        0.0              2.6           2.6          0.9  ...
# 3                        0.0              2.4           2.4          0.7  ...
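
# A minimal sketch of using the flag column shown above: rows whose value
# was imputed can be selected from the transformed frame via the IsMissing
# column emitted by Handler.
imputed_rows = features[features['PL.IsMissing.Petal_Length'] == 1.0]
print(imputed_rows)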
Example #14
import numpy as np
import pandas as pd

from nimbusml import FileDataStream
from nimbusml.preprocessing.missing_values import Handler

with_nans = pd.DataFrame(
    data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
              Sepal_Width=[.75, .9, .8, .76],
              Petal_Length=[np.nan, 2.5, 2.6, 2.4],
              Petal_Width=[.8, .7, .9, 0.7],
              Species=["setosa", "viginica", "", 'versicolor']))

# Write NaNs to file to see how transforms work
tmpfile = 'tmpfile_with_nans.csv'
with_nans.to_csv(tmpfile, index=False)

# an explicit schema for reading directly from the text file (used below)
schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
         'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 col=Species:TX:4 header+'

# read_csv can also infer a schema from the file; print it for comparison
data = FileDataStream.read_csv(tmpfile, collapse=True)
print(data.schema)

# Handler
# Creates 2 new columns:
# - 'NewVals.Sepal_Length' containing the imputed values
# - 'NewVals.IsMissing.Sepal_Length' flag indicating whether the value was imputed
# replace_with is one of ['Mean', 'Max', 'Min', 'Def']
nahandle = Handler(replace_with='Mean') << {'NewVals': 'Sepal_Length'}

print(with_nans)
data = FileDataStream(tmpfile, schema)
print('NAHandle\n', nahandle.fit_transform(data))
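
# A minimal sketch, assuming Handler also accepts an in-memory DataFrame as
# in the test examples above: the same imputation can be run on the numeric
# columns of 'with_nans' without writing them to a file first.
numeric_cols = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']
print('NAHandle (DataFrame input)\n',
      Handler(replace_with='Mean').fit_transform(with_nans[numeric_cols]))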