def test_trees_file(self):
    """Train FastTreesBinaryClassifier straight from a file stream and
    require at least 0.65 accuracy on the held-out test file."""
    pipeline = Pipeline([OneHotVectorizer() << categorical_columns,
                         FastTreesBinaryClassifier() << {
                             'Label': label_column}])
    train_stream = FileDataStream(train_file, schema=file_schema)
    pipeline.fit(train_stream, label_column)
    test_stream = FileDataStream(test_file, schema=file_schema)
    out_data = pipeline.predict(test_stream)
    # 0.65 is the minimum acceptable accuracy for this dataset.
    check_accuracy(test_file, label_column, out_data, 0.65)
def data_wt_rename(self, label_name, group_id, features):
    """Build a ticket-train FileDataStream with caller-chosen column names.

    If the label column is not literally named 'Label', the Label role is
    attached explicitly so downstream learners can still locate it.
    """
    dataset_path = get_dataset("gen_tickettrain").as_filepath()
    schema_text = ('sep=, col={label}:R4:0 col={group_id}:TX:1 '
                   'col={features}:R4:3-5').format(label=label_name,
                                                   group_id=group_id,
                                                   features=features)
    stream = FileDataStream(dataset_path, schema=schema_text)
    if label_name == 'Label':
        return stream
    stream._set_role(Role.Label, label_name)
    return stream
def test_linear_file_role(self):
    """Supply the label via a stream role instead of a fit() argument."""
    pipeline = Pipeline([OneHotVectorizer() << categorical_columns,
                         FastLinearBinaryClassifier(train_threads=1,
                                                    shuffle=False)])
    train_stream = FileDataStream(train_file, schema=file_schema)
    # Attach the Label role to the stream itself; fit() then takes no y.
    train_stream._set_role('Label', label_column)
    pipeline.fit(train_stream)
    test_stream = FileDataStream(test_file, schema=file_schema)
    out_data = pipeline.predict(test_stream)
    check_accuracy(test_file, label_column, out_data, 0.65)
def test_schema_airquality(self):
    """Schema inferred from the airquality csv matches the expected text
    and round-trips through FileDataStream construction and read_csv."""
    train_file = get_dataset("airquality").as_filepath()
    found = DataSchema.read_schema(train_file)
    schema = "col=Unnamed0:I8:0 col=Ozone:R8:1 col=Solar_R:R8:2 " \
             "col=Wind:R8:3 col=Temp:I8:4 col=Month:I8:5 " \
             "col=Day:I8:6 header=+"
    assert str(found) == schema
    # Same schema when passed explicitly to a FileDataStream...
    fds = FileDataStream(train_file, schema)
    assert str(fds.schema) == schema
    # ...and when read_csv infers it automatically.
    fds = FileDataStream.read_csv(train_file)
    assert str(fds.schema) == schema
def test_schema_infert(self):
    """Inferred infert schema: I8 numerics, TX for education; must
    round-trip through FileDataStream and read_csv."""
    train_file = get_dataset("infert").as_filepath()
    found = DataSchema.read_schema(train_file)
    schema = "col=row_num:I8:0 col=education:TX:1 col=age:I8:2 " \
             "col=parity:I8:3 col=induced:I8:4 " + \
             "col=case:I8:5 col=spontaneous:I8:6 col=stratum:I8:7 " \
             "col=pooled.stratum:I8:8 header=+"
    assert str(found) == schema
    fds = FileDataStream(train_file, schema)
    assert str(fds.schema) == schema
    fds = FileDataStream.read_csv(train_file)
    assert str(fds.schema) == schema
def test_linear_file(self):
    """FastLinear trained from a file stream; the parsed schema must
    expose its loader options (separator, header)."""
    pipeline = Pipeline([OneHotVectorizer() << categorical_columns,
                         FastLinearBinaryClassifier(train_threads=1,
                                                    shuffle=False)])
    train_stream = FileDataStream(train_file, schema=file_schema)
    # Loader options from the schema string are kept on the stream.
    assert 'sep' in train_stream.schema.options
    assert 'header' in train_stream.schema.options
    pipeline.fit(train_stream, label_column)
    test_stream = FileDataStream(test_file, schema=file_schema)
    out_data = pipeline.predict(test_stream)
    check_accuracy(test_file, label_column, out_data, 0.65)
def test_schema_infert_R4(self):
    """numeric_dtype=float32 must turn every numeric column into R4."""
    train_file = get_dataset("infert").as_filepath()
    found = DataSchema.read_schema(train_file,
                                   numeric_dtype=numpy.float32)
    schema = "col=row_num:R4:0 col=education:TX:1 col=age:R4:2 " \
             "col=parity:R4:3 col=induced:R4:4 " + \
             "col=case:R4:5 col=spontaneous:R4:6 col=stratum:R4:7 " \
             "col=pooled.stratum:R4:8 header=+"
    assert str(found) == schema
    fds = FileDataStream(train_file, schema)
    assert str(fds.schema) == schema
    # read_csv must honor the same dtype override.
    fds = FileDataStream.read_csv(train_file, numeric_dtype=numpy.float32)
    assert str(fds.schema) == schema
def setUpClass(self):
    """Fit one model per task type (binary, multiclass, regression,
    ranking) and cache its permutation feature importance (PFI) so the
    individual tests can inspect the precomputed results.

    NOTE(review): the parameter is conventionally named `cls` for a
    setUpClass; kept as `self` here to leave the code unchanged.
    """
    # Binary and multiclass models share the uciadult training data.
    adult_path = get_dataset('uciadult_train').as_filepath()
    self.classification_data = FileDataStream.read_csv(adult_path)
    binary_pipeline = Pipeline([
        OneHotVectorizer(columns=['education']),
        LogisticRegressionBinaryClassifier(feature=['age', 'education'],
                                           label='label',
                                           number_of_threads=1)
    ])
    self.binary_model = binary_pipeline.fit(self.classification_data)
    self.binary_pfi = self.binary_model.permutation_feature_importance(
        self.classification_data)
    classifier_pipeline = Pipeline([
        OneHotVectorizer(columns=['education']),
        FastLinearClassifier(feature=['age', 'education'],
                             label='label',
                             number_of_threads=1,
                             shuffle=False)
    ])
    self.classifier_model = classifier_pipeline.fit(
        self.classification_data)
    self.classifier_pfi = \
        self.classifier_model.permutation_feature_importance(
            self.classification_data)
    # Regression uses infert, predicting age from induced + education.
    infert_path = get_dataset('infert').as_filepath()
    self.regression_data = FileDataStream.read_csv(infert_path)
    regressor_pipeline = Pipeline([
        OneHotVectorizer(columns=['education']),
        FastLinearRegressor(feature=['induced', 'education'],
                            label='age',
                            number_of_threads=1,
                            shuffle=False)
    ])
    self.regressor_model = regressor_pipeline.fit(self.regression_data)
    self.regressor_pfi = \
        self.regressor_model.permutation_feature_importance(
            self.regression_data)
    # Ranking uses the ticket dataset; the group id is converted to a
    # key type via ToKey before reaching LightGbmRanker.
    ticket_path = get_dataset('gen_tickettrain').as_filepath()
    self.ranking_data = FileDataStream.read_csv(ticket_path)
    ranker_pipeline = Pipeline([
        ToKey(columns=['group']),
        LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                       label='rank',
                       group_id='group',
                       random_state=0,
                       number_of_threads=1)
    ])
    self.ranker_model = ranker_pipeline.fit(self.ranking_data)
    self.ranker_pfi = self.ranker_model.permutation_feature_importance(
        self.ranking_data)
def test_lightgbmranker_asfilestream(self):
    """LightGbmRanker trained and evaluated on file streams; NDCG/DCG
    metrics are pinned to known-good values."""
    # Data file
    file_path = get_dataset("gen_tickettrain").as_filepath()
    # Pure-nimbusml paradigm
    train_stream = FileDataStream.read_csv(file_path, encoding='utf-8')
    # pipeline
    pipeline = Pipeline([
        # the group_id column must be of key type
        ToKey(columns={
            'rank': 'rank',
            'group': 'group'
        }),
        LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                       label='rank',
                       group_id='group')
    ])
    # train
    pipeline.fit(train_stream)
    # test: evaluate on the same file re-opened as a fresh stream.
    eval_stream = FileDataStream.read_csv(file_path)
    metrics, _ = pipeline.test(eval_stream)
    assert_almost_equal(metrics['NDCG@1'][0], 43.571429, decimal=5,
                        err_msg="NDCG@1 should be %s" % 43.571429)
    assert_almost_equal(metrics['NDCG@2'][0], 51.28226, decimal=5,
                        err_msg="NDCG@2 should be %s" % 51.28226)
    assert_almost_equal(metrics['NDCG@3'][0], 55.168069, decimal=5,
                        err_msg="NDCG@3 should be %s" % 55.168069)
    assert_almost_equal(metrics['DCG@1'][0], 4.688759, decimal=3,
                        err_msg="DCG@1 should be %s" % 4.688759)
    assert_almost_equal(metrics['DCG@2'][0], 9.012395, decimal=3,
                        err_msg="DCG@2 should be %s" % 9.012395)
    assert_almost_equal(metrics['DCG@3'][0], 11.446943, decimal=3,
                        err_msg="DCG@3 should be %s" % 11.446943)
def test_ensemble_supports_cv_with_user_defined_transforms(self):
    """Cross-validated VotingRegressor ensemble must achieve a lower
    average L2 than the single-LightGBM baseline, for both CV split
    points ('before_transforms' and 'after_transforms')."""
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)
    ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
    handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
    # All three learners see the same features and predict 'Wind'.
    lgbm_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind',
                    'Temp'],
        'label': 'Wind',
        'normalize': 'Yes'
    }
    ols_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind',
                    'Temp'],
        'label': 'Wind',
        'normalize': 'Yes'
    }
    ogd_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind',
                    'Temp'],
        'label': 'Wind',
        'shuffle': False,
        'normalize': 'Yes'
    }
    for split_start in ['before_transforms', 'after_transforms']:
        # Baseline: user transforms followed by a single LightGBM.
        pipeline_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            LightGbmRegressor(**lgbm_args)
        ]
        cv_results = CV(pipeline_steps).fit(data,
                                            split_start=split_start)
        l2_avg_lgbm = cv_results['metrics_summary'].loc['Average',
                                                        'L2(avg)']
        r1 = OrdinaryLeastSquaresRegressor(**ols_args)
        r2 = OnlineGradientDescentRegressor(**ogd_args)
        r3 = LightGbmRegressor(**lgbm_args)
        # NOTE(review): the stream is re-created before the second CV
        # run — presumably the first run consumed it; confirm.
        data = FileDataStream(path, schema)
        pipeline_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            VotingRegressor(estimators=[r1, r2, r3],
                            combiner='Average')
        ]
        cv_results = CV(pipeline_steps).fit(data,
                                            split_start=split_start)
        l2_avg_ensemble = cv_results['metrics_summary'].loc['Average',
                                                            'L2(avg)']
        self.assertTrue(l2_avg_ensemble < l2_avg_lgbm)
def test_with_or_without_pipeline(self):
    """Predictions must agree whether the learner is used standalone or
    wrapped in a Pipeline (regression test for Bug 227810)."""
    # Bug 227810
    # data input (as a FileDataStream)
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=education:TX:1 col=Features:R4:2-4,6-8 ' \
                  'col=case:R4:5 header=+'
    data = FileDataStream(path, schema=file_schema)
    # without pipeline -- fails
    m = LogisticRegressionBinaryClassifier(feature=['Features'],
                                           label='case')
    m.fit(data)
    scores1 = m.predict(data)
    # with pipeline -- works
    m = Pipeline([
        LogisticRegressionBinaryClassifier(feature=['Features'],
                                           label='case')
    ])
    m.fit(data)
    scores2 = m.predict(data)
    # Tolerate at most 2 differing predicted labels between the paths.
    diff = np.abs(scores1.values.ravel() -
                  scores2[['PredictedLabel']].values.ravel())
    assert diff.sum() <= 2
def test_filter_no_renaming(self):
    """Filter must reject dictionary column specifications.

    Both a same-name mapping and a renaming mapping should raise
    TypeError with the documented message. Previously this test passed
    vacuously when no exception was raised (the asserts lived only in
    the except blocks); it now fails explicitly in that case, matching
    the pattern used by the other negative tests in this file. The
    temporary csv is also cleaned up, mirroring test_filter.
    """
    with_nans = pd.DataFrame(
        data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
                  Sepal_Width=[.75, .9, .8, .76],
                  Petal_Length=[np.nan, 2.5, 2.6, 2.4],
                  Petal_Width=[.8, .7, .9, 0.7],
                  Species=["setosa", "viginica", "", 'versicolor']))
    tmpfile = 'tmpfile_with_nans.csv'
    with_nans.to_csv(tmpfile, index=False)
    file_schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
                  'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 ' \
                  'col=Species:TX:4 header+'
    data = FileDataStream(tmpfile, schema=file_schema)
    try:
        # Same-name dict and renaming dict must both be rejected.
        for columns in ({'Petal_Length': 'Petal_Length'},
                        {'Petal_Length2': 'Petal_Length'}):
            try:
                xf = Filter(columns=columns)
                xf.fit(data)
            except TypeError as e:
                assert 'Dictionaries are not allowed to specify input ' \
                       'columns.' in str(e)
            else:
                raise AssertionError(
                    'Filter should have raised TypeError for %r'
                    % (columns,))
    finally:
        # Clean up the temp file even on failure (cf. test_filter).
        os.remove(tmpfile)
def test_filter(self):
    """Filter over all four columns drops every row with a missing value."""
    column_names = ['Petal_Length', 'Petal_Width',
                    'Sepal_Length', 'Sepal_Width']
    with_nans = pd.DataFrame(
        data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
                  Sepal_Width=[.75, .9, .8, .76],
                  Petal_Length=[np.nan, 2.5, 2.6, 2.4],
                  Petal_Width=[.8, .7, .9, 0.7]))
    tmpfile = 'tmpfile_with_nans.csv'
    with_nans.to_csv(tmpfile, index=False, na_rep='?')
    file_schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
                  'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 header+'
    data = FileDataStream(tmpfile, schema=file_schema)
    xf = Filter(columns=column_names)
    features = xf.fit_transform(data)
    # Two of the four rows contain a NaN, so two survive.
    assert features.shape == (2, 4)
    print(features.columns)
    # columns ordering changed between 0.22 and 0.23
    assert set(features.columns) == set(column_names)
    os.remove(tmpfile)
def test_model_summary_not_supported(self):
    """summary() must raise TypeError for learners that don't support it."""
    for learner in learners_not_supported:
        steps = [OneHotVectorizer() << categorical_columns, learner]
        pipeline = Pipeline(steps)
        stream = FileDataStream(train_file, schema=file_schema)
        pipeline.fit(stream, label_column)
        assert_raises(TypeError, pipeline.summary)
def test_model_summary(self):
    """summary() must succeed for every learner that supports it."""
    for learner in learners:
        steps = [OneHotVectorizer() << categorical_columns, learner]
        pipeline = Pipeline(steps)
        stream = FileDataStream(train_file, schema=file_schema)
        pipeline.fit(stream, label_column)
        pipeline.summary()
def test_combined_models_support_predict_proba_with_more_than_2_classes(
        self):
    """predict_proba on a combined (concatenator + OVR predictor)
    pipeline must return one probability column per class (3 here)."""
    path = get_dataset('infert').as_filepath()
    data = FileDataStream.read_csv(path)
    # Featurize 'education' once; the fitted model is reused below.
    featurization_pipeline = Pipeline(
        [OneHotVectorizer(columns={'education': 'education'})])
    featurization_pipeline.fit(data)
    featurized_data = featurization_pipeline.transform(data)
    feature_cols = ['education', 'age']
    training_pipeline = Pipeline([
        DatasetTransformer(featurization_pipeline.model),
        OneVsRestClassifier(LogisticRegressionBinaryClassifier(),
                            feature=feature_cols,
                            label='induced')
    ])
    # Keep a standalone predictor model so it can be reloaded below.
    training_pipeline.fit(data, output_predictor_model=True)
    # Re-assemble the 'education.*' slot columns into one vector column
    # so the predictor sees the schema it was trained on.
    concat_pipeline = Pipeline(
        [PrefixColumnConcatenator({'education': 'education.'})])
    concat_pipeline.fit(featurized_data)
    predictor_pipeline = Pipeline()
    predictor_pipeline.load_model(training_pipeline.predictor_model)
    concat_and_predictor_pipeline = Pipeline.combine_models(
        concat_pipeline, predictor_pipeline)
    result = concat_and_predictor_pipeline.predict_proba(featurized_data)
    # Three classes -> three probability columns.
    self.assertEqual(result.shape[1], 3)
def test_get_fit_info_fastl(self):
    """get_fit_info must report the start node and the Filter step with
    the expected inputs, outputs and schema."""
    train_file = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(train_file)
    data = FileDataStream(train_file, schema)
    pipeline = Pipeline([
        Filter(columns=['Ozone']),
        FastLinearRegressor(feature=['Solar_R', 'Temp'], label='Ozone')
    ])
    info = pipeline.get_fit_info(data)
    # Expected first two nodes: the synthetic 'start' node exposing the
    # full file schema, then the Filter transform over 'Ozone'.
    exp = [{
        'name': None,
        'outputs': ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp',
                    'Month', 'Day'],
        'schema_after': ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp',
                         'Month', 'Day'],
        'type': 'start'
    }, {
        'inputs': ['Ozone'],
        'name': 'Filter',
        'outputs': ['Ozone'],
        'schema_after': ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp',
                         'Month', 'Day'],
        'type': 'transform'
    }]
    # Drop the 'operator' entries so the remaining dicts compare by value.
    for el in info[0]:
        if 'operator' in el:
            del el['operator']
    self.assertEqual(exp, info[0][:2])
def test_data_stream(self):
    """A cloned FileDataStream must repr() identically to its source."""
    frame = pandas.DataFrame(dict(a=[0, 1], b=[0.1, 0.2]))
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        frame.to_csv(f, sep=',', index=False)
    original = FileDataStream.read_csv(f.name, sep=',')
    duplicate = original.clone()
    assert repr(original) == repr(duplicate)
    os.remove(f.name)
def test_defaults(self):
    """Cross-validate KMeansPlusPlus with its default settings."""
    schema = DataSchema.read_schema(infert_file,
                                    numeric_dtype=np.float32)
    data = FileDataStream.read_csv(infert_file, schema=schema)
    clustering_features = ['edu', 'age', 'parity', 'spontaneous',
                           'stratum']
    pipeline_steps = [
        OneHotVectorizer(columns={'edu': 'education'}),
        KMeansPlusPlus(n_clusters=5, feature=clustering_features),
    ]
    check_cv(pipeline_steps, data)
def test_ngramfeaturizer_single(self):
    """NGramFeaturizer over the combined id+education text columns must
    produce the expected output width."""
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 quote+ header=+'
    data = FileDataStream(path, schema=file_schema)
    xf = NGramFeaturizer(word_feature_extractor=n_gram(),
                         columns={'features': ['id', 'education']})
    features = xf.fit_transform(data)
    # 248 rows; 652 columns including the generated n-gram slots.
    assert features.shape == (248, 652)
def test_groups(self):
    """Grouped cross-validation: folds split on the 'age' column must
    reproduce the pinned AUC/Accuracy."""
    # one learner type is enough for testing sanity of groups argument
    file_schema = 'sep=, col=age:TX:2 col=Label:R4:5 ' \
                  'col=Features:R4:6-8 header=+'
    data = FileDataStream(infert_file, schema=file_schema)
    expected_metrics = {'AUC': 0.704883, 'Accuracy': 0.717414}
    pipeline = self.pipeline(learner_arguments={'feature': 'Features'},
                             transforms=[])
    check_cv(pipeline, data, groups='age',
             expected_metrics=expected_metrics)
def test_multiple_user_specified_columns_is_not_allowed(self):
    """IidSpikeDetector must reject more than one input column."""
    path = get_dataset('timeseries').as_filepath()
    data = FileDataStream.read_csv(path)
    try:
        pipeline = Pipeline([
            IidSpikeDetector(columns=['t2', 't3'],
                             pvalue_history_length=5)
        ])
        pipeline.fit_transform(data)
    except RuntimeError as e:
        self.assertTrue('Only one column is allowed' in str(e))
        return
    # No exception was raised -- fail explicitly.
    self.fail()
def test_different_schema_with_filedatastream_input(self):
    """A pipeline built with DatasetTransformer must predict exactly
    like the equivalent monolithic pipeline on file-stream input."""
    train_filename = "train-data.csv"
    train_df.to_csv(train_filename, index=False, header=True)
    train_data_stream = FileDataStream.read_csv(train_filename,
                                                sep=',',
                                                header=True)
    test_filename = "test-data.csv"
    test_df.to_csv(test_filename, index=False, header=True)
    test_data_stream = FileDataStream.read_csv(test_filename,
                                               sep=',',
                                               header=True)
    # Create reference pipeline
    std_pipeline = Pipeline([
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ], random_state=seed)
    std_pipeline.fit(train_data_stream)
    result_1 = std_pipeline.predict(test_data_stream)
    # Create combined pipeline
    transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                  random_state=seed)
    transform_pipeline.fit(train_data_stream)
    combined_pipeline = Pipeline([
        DatasetTransformer(transform_model=transform_pipeline.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ], random_state=seed)
    combined_pipeline.fit(train_data_stream)
    # NOTE(review): the transform model file is deleted before
    # predicting -- presumably fit() has already absorbed it; confirm.
    os.remove(transform_pipeline.model)
    result_2 = combined_pipeline.predict(test_data_stream)
    self.assertTrue(result_1.equals(result_2))
    os.remove(train_filename)
    os.remove(test_filename)
def test_columns_concatenator(self):
    """ColumnConcatenator adds features.* slot columns next to the
    original columns rather than replacing them."""
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 header=+'
    data = FileDataStream(path, schema=file_schema)
    xf = ColumnConcatenator(
        columns={'features': ['age', 'parity', 'induced']})
    features = xf.fit_transform(data)
    # 7 original columns + 3 concatenated slots = 10.
    assert features.shape == (248, 10)
    # columns ordering changed between 0.22 and 0.23
    assert set(features.columns) == {
        'age', 'case', 'education',
        'features.age', 'features.induced', 'features.parity',
        'id', 'induced', 'parity', 'spontaneous'
    }
def test_schema_with_vectorized_column(self):
    """A binary data stream exposes two schema views: .schema keeps
    one-hot output as a single vector column ('education', 3 slots),
    while get_dataframe_schema expands it into one column per slot."""
    path = get_dataset('infert').as_filepath()
    data = FileDataStream.read_csv(path)
    featurization_pipeline = Pipeline(
        [OneHotVectorizer(columns={'education': 'education'})])
    featurization_pipeline.fit(data)
    featurized_data = featurization_pipeline.transform(
        data, as_binary_data_stream=True)
    # col=row_num:I8:0 col=education:R4:1-3 col=age:I8:4 col=parity:I8:5
    # col=induced:I8:6 col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9
    # col=pooled.stratum:I8:10 quote+
    schema = featurized_data.schema
    self.assertEqual(len(schema), 9)
    self.assertEqual(schema['age'].Type, 'I8')
    self.assertEqual(schema['age'].Name, 'age')
    self.assertEqual(schema['age'].IsVector, False)
    # The vectorized column stays a single 3-slot R4 vector here.
    self.assertEqual(schema['education'].Type, 'R4')
    self.assertEqual(schema['education'].Name, 'education')
    self.assertEqual(len(schema['education'].Pos), 3)
    self.assertEqual(schema['education'].IsVector, True)
    self.assertTrue('education.0-5yrs' not in schema)
    self.assertTrue('education.6-11yrs' not in schema)
    self.assertTrue('education.12+yrs' not in schema)
    # col=row_num:I8:0 col=education.0-5yrs:R4:1 col=education.6-11yrs:R4:2
    # col=education.12+yrs:R4:3 col=age:I8:4 col=parity:I8:5 col=induced:I8:6
    # col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9 col=pooled.stratum:I8:10
    # quote+ header=+
    schema = featurized_data.get_dataframe_schema()
    self.assertEqual(len(schema), 11)
    self.assertEqual(schema['age'].Type, 'I8')
    self.assertEqual(schema['age'].Name, 'age')
    self.assertEqual(schema['age'].IsVector, False)
    # In the dataframe view the vector is flattened to scalar slots.
    self.assertTrue('education' not in schema)
    self.assertTrue('education.0-5yrs' in schema)
    self.assertTrue('education.6-11yrs' in schema)
    self.assertTrue('education.12+yrs' in schema)
    self.assertEqual(schema['education.0-5yrs'].Type, 'R4')
    self.assertEqual(schema['education.0-5yrs'].Name,
                     'education.0-5yrs')
    self.assertEqual(schema['education.0-5yrs'].IsVector, False)
def test_fit_transform(self):
    """MinMaxScaler output must be identical whether the input comes
    from an azureml Dataprep dataflow or a plain file stream."""
    import azureml.dataprep as dprep
    path = get_dataset('infert').as_filepath()
    dflow = dprep.auto_read_file(path=path)
    dprep_data = DprepDataStream(dflow)
    file_data = FileDataStream.read_csv(path)
    xf = MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'})
    pipe = Pipeline([xf])
    transformed_data = pipe.fit_transform(file_data)
    transformed_data1 = pipe.fit_transform(dprep_data)
    # Same columns and same cell values from both input paths.
    assert_array_equal(transformed_data.columns,
                       transformed_data1.columns)
    assert_2d_array_equal(transformed_data.values,
                          transformed_data1.values)
def test_word_embedding_example2(self):
    """WordEmbedding consumes the transformed-text column emitted by
    NGramFeaturizer (output_tokens=True)."""
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 header=+'
    data = FileDataStream(path, schema=file_schema)
    pipeline = Pipeline([
        # output_tokens=True adds the 'features_TransformedText' column.
        NGramFeaturizer(word_feature_extractor=Ngram(),
                        output_tokens=True,
                        columns={'features': ['id', 'education']}),
        WordEmbedding(columns='features_TransformedText')
    ])
    features = pipeline.fit_transform(data)
    assert features.shape == (248, 802)
    # Embedding slot columns are suffixed with their index.
    assert 'features_TransformedText.94' in list(features.columns)
def test_fit_transform(self):
    """fit_transform must equal fit() followed by transform()."""
    # data input (as a FileDataStream)
    path = get_dataset('infert').as_filepath()
    data = FileDataStream.read_csv(path)
    # transform usage
    encoder = OneHotVectorizer(columns={'edu': 'education',
                                        'in': 'induced',
                                        'sp': 'spontaneous'})
    # fit and transform, via both call paths
    combined = encoder.fit_transform(data)
    separate = encoder.fit(data).transform(data)
    assert_frame_equal(combined, separate)
def test_multiple_user_specified_columns_is_not_allowed(self):
    """SsaForecaster must reject more than one input column."""
    path = get_dataset('timeseries').as_filepath()
    data = FileDataStream.read_csv(path)
    try:
        pipeline = Pipeline([
            SsaForecaster(series_length=8,
                          train_size=15,
                          window_size=5,
                          horizon=2,
                          columns=['t2', 't3'])
        ])
        pipeline.fit_transform(data)
    except RuntimeError as e:
        self.assertTrue('Only one column is allowed' in str(e))
        return
    # No exception was raised -- fail explicitly.
    self.fail()
def test_word_embedding_example_dict_newname(self):
    """WordEmbedding with a dict mapping writes embeddings into a new
    output column instead of replacing the token column."""
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 quote+ header=+'
    data = FileDataStream(path, schema=file_schema)
    pipeline = Pipeline([
        NGramFeaturizer(word_feature_extractor=Ngram(),
                        output_tokens_column_name='features_TransformedText',
                        columns={'features': ['id', 'education']}),
        # What is features_TransformedText?
        WordEmbedding(
            columns={
                'features_TransformedText2': 'features_TransformedText'})
    ])
    features = pipeline.fit_transform(data)
    assert features.shape == (248, 409)