def test_pipeline_pca(self):
    """Check the fit info reported for a pipeline with one PcaTransformer.

    The name of the temporary column used by the last step changes from
    one execution to the next, so it is normalised to ``'temp_'`` before
    the reported steps are compared with the expected ones.
    """
    X = numpy.array([[1.0, 2, 3], [2, 3, 4], [3, 4, 5]])
    pipeline = Pipeline([PcaTransformer(rank=2)])
    infos = pipeline.get_fit_info(X)[0]
    for inf in infos:
        # The operator instance is not part of the comparison.
        inf.pop('operator', None)

    exp = [
        {
            'name': None,
            'schema_after': ['F0', 'F1', 'F2'],
            'type': 'start',
            'outputs': ['F0', 'F1', 'F2'],
        },
        {
            'name': 'TypeConverter',
            'inputs': ['F0', 'F1', 'F2'],
            'type': 'transform',
            'outputs': ['F0', 'F1', 'F2'],
            'schema_after': ['F0', 'F1', 'F2'],
        },
        {
            'name': 'PcaTransformer',
            'inputs': ['temp_'],
            'type': 'transform',
            'outputs': ['temp_'],
            'schema_after': ['F0', 'F1', 'F2', 'temp_'],
        },
    ]

    # The temporary column id depends on id(node), different at each
    # execution, so replace it with a stable placeholder.
    last = infos[-1]
    last["inputs"] = ["temp_"]
    last["outputs"] = ["temp_"]
    # schema_after holds column names, so the placeholder must be a plain
    # string (not a one-element list) to match the expected schema.
    last["schema_after"][-1] = "temp_"

    # Require every step to match; asserting that "any step differs"
    # would pass no matter what the pipeline reported.
    self.assertEqual(exp, infos)
def test_pipeline_info(self):
    """Compare get_fit_info output for a scaler/one-hot/drop/regressor
    pipeline against the expected list of step descriptions.

    Raises:
        Exception: carrying the reported steps when they differ from the
            expected ones.
    """
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

    scaler = MeanVarianceScaler() << {'new_y': 'yy'}
    onehot = OneHotVectorizer() << ['workclass', 'education']
    dropper = Drop() << 'yy'
    regressor = FastLinearRegressor() << {
        'Feature': ['workclass', 'education'],
        Role.Label: 'new_y',
    }
    pipeline = Pipeline([scaler, onehot, dropper, regressor])

    infos = pipeline.get_fit_info(frame)[0]
    for step in infos:
        # Operator instances are excluded from the comparison.
        step.pop('operator', None)

    expected = [
        {
            'name': None,
            'type': 'start',
            'outputs': ['education', 'workclass', 'yy'],
            'schema_after': ['education', 'workclass', 'yy'],
        },
        {
            'name': 'TypeConverter',
            'type': 'transform',
            'inputs': ['yy'],
            'outputs': ['new_y'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
        },
        {
            'name': 'MeanVarianceScaler',
            'type': 'transform',
            'inputs': ['new_y'],
            'outputs': ['new_y'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
        },
        {
            'name': 'OneHotVectorizer',
            'type': 'transform',
            'inputs': ['workclass', 'education'],
            'outputs': ['workclass', 'education'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
        },
        {
            'name': 'ColumnDropper',
            'type': 'transform',
            'inputs': ['education', 'workclass', 'yy', 'new_y'],
            'outputs': ['education', 'workclass', 'new_y'],
            'schema_after': ['education', 'workclass', 'new_y'],
        },
        {
            'name': 'FastLinearRegressor',
            'type': 'regressor',
            'inputs': ['Feature:education,workclass', 'Label:new_y'],
            'outputs': ['Score'],
            'schema_after': ['Score'],
        },
    ]

    if infos == expected:
        return
    # Surface the actual steps so the mismatch can be inspected.
    raise Exception(infos)
def test_ensemble_supports_get_fit_info(self):
    """Check the last fit-info node of a pipeline ending in a
    VotingRegressor built from three regressors."""
    frame = pd.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

    # All three base learners share the same column roles.
    roles = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}
    learners = [
        OrdinaryLeastSquaresRegressor(normalize="Yes") << roles,
        OnlineGradientDescentRegressor(normalize="Yes") << roles,
        LightGbmRegressor(normalize="Yes") << roles,
    ]

    steps = [
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        ColumnDropper() << 'yy',
        VotingRegressor(estimators=learners, combiner='Average'),
    ]
    pipeline = Pipeline(steps)

    node = pipeline.get_fit_info(frame)[0][-1]

    self.assertEqual(node['inputs'],
                     ['Feature:education,workclass', 'Label:new_y'])
    self.assertEqual(node['name'], 'VotingRegressor')
    self.assertIsInstance(node['operator'], VotingRegressor)
    self.assertEqual(node['outputs'], ['Score'])
    self.assertEqual(node['schema_after'], ['Score'])
    self.assertEqual(node['type'], 'regressor')
def test_get_fit_info_fastl(self):
    """Check the first two fit-info steps of a Filter + FastLinearRegressor
    pipeline fed from the "airquality" dataset file."""
    path = get_dataset("airquality").as_filepath()
    stream = FileDataStream(path, DataSchema.read_schema(path))

    pipeline = Pipeline([
        Filter(columns=['Ozone']),
        FastLinearRegressor(feature=['Solar_R', 'Temp'], label='Ozone'),
    ])
    steps = pipeline.get_fit_info(stream)[0]

    columns = ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp', 'Month',
               'Day']
    expected = [
        {
            'name': None,
            'type': 'start',
            'outputs': list(columns),
            'schema_after': list(columns),
        },
        {
            'name': 'Filter',
            'type': 'transform',
            'inputs': ['Ozone'],
            'outputs': ['Ozone'],
            'schema_after': list(columns),
        },
    ]

    for step in steps:
        # Operator instances are excluded from the comparison.
        step.pop('operator', None)

    # Only the start node and the Filter step are compared.
    self.assertEqual(expected, steps[:2])
def test_pipeline_with_no_columns(self):
    """Fit an NGramFeaturizer + LightGbmClassifier pipeline declared
    without explicit columns, passing the label first as a pandas Series
    and then as a numpy array."""
    train = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ",
            "review ",
            "sentence ",
            "an apple",
            "sentence 22",
            "another one one one",
        ],
    })

    def build():
        # A fresh, unfitted pipeline for each scenario.
        return Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1),
        ])

    # Scenario 1: label given as a pandas Series.
    ppl = build()
    assert ppl is not None

    # Bug 147697
    info = ppl.get_fit_info(train[["SentimentText"]], train["Sentiment"])
    assert len(info) == 2
    assert len(info[0]) == 3
    ppl.fit(train[["SentimentText"]], train["Sentiment"])

    # Scenario 2: label given as a numpy array.
    ppl = build()
    assert ppl is not None
    ppl.fit(train[["SentimentText"]], np.array(train["Sentiment"]))
def test_pipeline_with_no_columns_raise(self):
    """Expect RuntimeError when fitting an NGramFeaturizer +
    LightGbmClassifier pipeline (default classifier arguments) on a
    six-row text dataset."""
    train = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ",
            "review ",
            "sentence ",
            "an apple",
            "sentence 22",
            "another one one one",
        ],
    })

    featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
    ppl = Pipeline([featurizer, LightGbmClassifier()])
    assert ppl is not None

    # Bug 147697
    info = ppl.get_fit_info(train[["SentimentText"]], train["Sentiment"])
    assert len(info) == 2
    assert len(info[0]) == 3

    # Error message recorded by the original author for this failure:
    #   System.InvalidOperationException:
    #   'LightGBM Error, code is -1, error message is
    #   'Cannot construct Dataset since there are not useful features.
    #   It should be at least two unique rows.
    #   If the num_row (num_data) is small,
    #   you can set min_data=1 and min_data_in_bin=1 to fix this.
    #   Otherwise please make sure you are using the right dataset.'
    with self.assertRaises(RuntimeError):
        ppl.fit(train[["SentimentText"]], train["Sentiment"])
def test_plot_fitted_cloned_pipeline(self):
    """Check that fit info and the dot export of a pipeline are the same
    before fitting, after one fit and after a second fit."""
    frame = pd.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1.0, 3, 2, 3, 4]))

    pipeline = Pipeline([
        OneHotVectorizer() << ['workclass', 'education'],
        FastLinearRegressor(feature=['workclass', 'education'], label='y'),
    ])

    # Only values of these types are compared between the snapshots.
    comparable = (list, dict, str, int, float, tuple)

    def report(index, key, left, right):
        # Dump both step descriptions, then fail with the differing values.
        import pprint
        pprint.pprint(left)
        pprint.pprint(right)
        raise Exception(
            "Issue with op={0}\nk='{1}'\n---\n{2}\n---\n{3}".format(
                index, key, left[key], right[key]))

    info1 = pipeline.get_fit_info(frame)[0]
    res1 = dot_export_pipeline(pipeline, frame)
    assert res1 is not None

    pipeline.fit(frame)
    info2 = pipeline.get_fit_info(frame)[0]
    assert len(info1) == len(info2)

    pipeline.fit(frame)
    info3 = pipeline.get_fit_info(frame)[0]
    assert len(info1) == len(info3)

    for index, (before, once, twice) in enumerate(
            zip(info1, info2, info3)):
        keys = sorted(before)
        assert keys == sorted(once)
        assert keys == sorted(twice)
        for key in keys:
            if isinstance(before[key], comparable):
                # Fitted snapshots are compared first, then unfitted
                # against the first fitted one.
                if once[key] != twice[key]:
                    report(index, key, once, twice)
                if before[key] != once[key]:
                    report(index, key, before, once)

    res2 = dot_export_pipeline(pipeline, frame)
    assert res2 is not None
    assert res1 == res2
def test_get_fit_info(self):
    """Check that a fitted pipeline starting with a DatasetTransformer
    reports that transformer as step 1 of its fit info."""
    range_filter = RangeFilter(min=0.0, max=4.5) << 'c2'
    transform_pipeline = Pipeline([range_filter])
    transform_pipeline.fit(train_df)

    # Reuse the fitted transform model as the first step of a new pipeline.
    steps = [
        DatasetTransformer(transform_model=transform_pipeline.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c1']),
    ]
    combined_pipeline = Pipeline(steps, random_state=seed)
    combined_pipeline.fit(train_df)

    fit_info = combined_pipeline.get_fit_info(train_df)
    # Index 0 of the step list is skipped; index 1 is the one checked.
    self.assertEqual(fit_info[0][1]['name'], 'DatasetTransformer')
def test_get_fit_info_anomaly(self):
    """Fit a OneClassSvmAnomalyDetector pipeline on the "iris" dataset and
    check that predictions cover every training row and that the last
    fit-info step exposes outputs."""
    df = get_dataset("iris").as_df()
    df.drop(['Label', 'Setosa', 'Species'], axis=1, inplace=True)

    # Only the training part of the split is used.
    X_train, _ = train_test_split(df)

    detector = OneClassSvmAnomalyDetector(  # noqa
        kernel=PolynomialKernel(a=1.0))
    svm = Pipeline([detector])  # noqa
    svm.fit(X_train, verbose=0)

    scores = svm.predict(X_train)
    outputs = svm.get_fit_info(X_train)[0][-1]['outputs']

    assert len(scores) == len(X_train)
    assert outputs is not None
def test_get_fit_info_clustering(self):
    """Fit a three-cluster KMeansPlusPlus pipeline and check the output
    columns of its last fit-info step and the number of predictions."""
    # The three feature columns hold the same nine values.
    values = [0, 1, 2, 10, 11, 12, -10, -11, -12]
    X_train = pandas.DataFrame(
        data=dict(x=list(values), y=list(values), z=list(values)))
    y_train = pandas.DataFrame(
        data=dict(clusterid=[0, 0, 0, 1, 1, 1, 2, 2, 2]))

    pipeline = Pipeline([KMeansPlusPlus(n_clusters=3)])
    pipeline.fit(X_train, y_train, verbose=0)
    scores = pipeline.predict(X_train)

    fit_info = pipeline.get_fit_info(X_train, y_train)
    outputs = fit_info[0][-1]['outputs']

    assert outputs == ['PredictedLabel', 'Score.0', 'Score.1', 'Score.2']
    assert len(scores) == 9
def test_averagedperceptron_unsupported_losses_syntax(self):
    """Check label handling of AveragedPerceptronBinaryClassifier.

    First, fitting with ``y`` while ``X`` still contains the label column
    and a step declares ``label`` must raise RuntimeError. Then the same
    kind of pipeline is fitted on the full frame alone and used to
    predict on the frame without the label column.
    """
    df = get_dataset("infert").as_df().drop('row_num', axis=1)
    X = df
    y = df['case']

    onehot_columns = {
        'age1': 'age',
        'parity1': 'parity',
        'sp1': 'spontaneous',
    }

    pipeline = Pipeline([
        OneHotVectorizer(columns=dict(onehot_columns)),
        OneHotVectorizer(columns={'education_str': 'education_str'}),
        ColumnDuplicator(columns={'case2': 'case'}),
        AveragedPerceptronBinaryClassifier(
            feature=['age1', 'education_str'], label='case'),
    ])

    try:
        pipeline.fit(X, y, verbose=0)
    except RuntimeError as err:
        assert "If any step in the pipeline has defined Label" in str(err)
    else:
        # fit() returning normally means the conflict went undetected.
        raise AssertionError("same column name in X and y")

    X = X.drop('case', axis=1)

    # Same pipeline without the ColumnDuplicator step, which the original
    # author marked as "does not work" here.
    pipeline = Pipeline([
        OneHotVectorizer(columns=dict(onehot_columns)),
        OneHotVectorizer(columns={'education_str': 'education_str'}),
        AveragedPerceptronBinaryClassifier(
            feature=['age1', 'education_str'], label='case'),
    ])

    last_step = pipeline.get_fit_info(df)[0][-1]
    assert last_step['inputs'] != ['Feature:Features', 'Label:case']

    model = pipeline.fit(df)
    y_pred_withpipeline = model.predict(X)

    expected_columns = {'PredictedLabel', 'Probability', 'Score'}
    assert set(y_pred_withpipeline.columns) == expected_columns
    assert y_pred_withpipeline.shape == (248, 3)
def test_pipeline_info_strategy_previous_2_accumulate(self):
    """Compare get_fit_info output, requested with
    ``iosklearn="accumulate"``, for two one-hot steps followed by a
    FastLinearRegressor declared without columns."""
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
    X = frame.drop('yy', axis=1)
    y = frame['yy']

    pipeline = Pipeline([
        OneHotVectorizer() << ['workclass'],
        OneHotVectorizer() << ['education'],
        FastLinearRegressor(),
    ])

    infos = pipeline.get_fit_info(X, y, iosklearn="accumulate")[0]
    for step in infos:
        # Operator instances are excluded from the comparison.
        step.pop('operator', None)

    expected = [
        {
            'name': None,
            'type': 'start',
            'outputs': ['education', 'workclass', 'yy'],
            'schema_after': ['education', 'workclass', 'yy'],
        },
        {
            'name': 'OneHotVectorizer',
            'type': 'transform',
            'inputs': ['workclass'],
            'outputs': ['workclass'],
            'schema_after': ['education', 'workclass', 'yy'],
        },
        {
            'name': 'OneHotVectorizer',
            'type': 'transform',
            'inputs': ['education'],
            'outputs': ['education'],
            'schema_after': ['education', 'workclass', 'yy'],
        },
        {
            'name': 'FastLinearRegressor',
            'type': 'regressor',
            'inputs': ['Feature:education,workclass', 'Label:yy'],
            'outputs': ['Score'],
            'schema_after': ['Score'],
        },
    ]

    assert infos == expected
def test_get_fit_info_ranker(self):
    """Check that the last fit-info step of a LightGbmRanker pipeline
    lists the group id role among its inputs."""
    path = get_dataset("gen_tickettrain").as_filepath()
    schema = ('sep=, col=Label_1:R4:0 col=GroupId_2:TX:1 '
              'col=Features_3:R4:3-5')
    stream = FileDataStream(path, schema=schema)

    to_key = ToKey() << {'GroupId_2': 'GroupId_2'}
    concat = ColumnConcatenator() << {'Features': ['Features_3']}
    ranker = LightGbmRanker() << {
        Role.Feature: 'Features',
        Role.Label: 'Label_1',
        Role.GroupId: 'GroupId_2',
    }
    pipeline = Pipeline([to_key, concat, ranker])

    ranker_inputs = pipeline.get_fit_info(stream)[0][-1]['inputs']
    assert 'GroupId:GroupId_2' in ranker_inputs
def test_syntax_onehot_trained_all_rename(self):
    """Combine clones of a fitted one-hot encoder and a fitted regressor.

    Selecting the expanded slot names ('edu2.A', 'edu2.B') as features is
    expected to raise RuntimeError; selecting the vector column 'edu2' is
    expected to fit.

    Raises:
        RuntimeError: carrying the pretty-printed fit info when the
            second pipeline fails to fit.
    """
    frame = pandas.DataFrame(dict(
        edu=['A', 'B', 'A', 'B', 'A'],
        wk=['X', 'X', 'Y', 'Y', 'Y'],
        Label=[1.1, 2.2, 1.24, 3.4, 3.4]))

    onehot = OneHotVectorizer() << {'edu2': 'edu'}
    onehot = onehot.fit(frame, verbose=0)
    encoded = onehot.transform(frame)

    slots = ['edu2.A', 'edu2.B']
    lr = FastLinearRegressor() << list(slots)
    lr = lr.fit(encoded, verbose=0)

    pipe = Pipeline([onehot.clone(), lr.clone() << list(slots)])
    with self.assertRaises(RuntimeError):
        # 'Feature column 'edu2.A' not found
        pipe.fit(frame, verbose=0)

    pipe = Pipeline([onehot.clone(), lr.clone() << ['edu2']])
    try:
        pipe.fit(frame, verbose=0)
    except RuntimeError:
        # This should work! Report the pipeline layout to help diagnose.
        import pprint
        details = pprint.pformat(pipe.get_fit_info(frame)[0])
        raise RuntimeError(details)