def build_auto(regressor, name, **pmml_options):
    """Fit ``regressor`` on the auto data and store its PMML and predictions.

    The estimator is placed behind a ``DataFrameMapper`` inside a
    ``PMMLPipeline``. The pipeline is fitted, given verification data,
    configured, and handed to ``store_pmml``; its predictions on ``auto_X``
    are then handed to ``store_csv``. ``auto_X``/``auto_y`` and the helper
    factories (``cat_domain``, ``cont_domain``, ``label_encoder``,
    ``label_binarizer``) are defined outside this function.

    Args:
        regressor: Final estimator of the pipeline. ``LGBMRegressor``,
            ``IsolationForest`` and ``XGBRegressor`` instances get dedicated
            handling; any other estimator is fitted on ``(auto_X, auto_y)``.
        name: Identifier passed to every helper factory and to
            ``store_pmml`` / ``store_csv``.
        **pmml_options: Forwarded unchanged to ``pipeline.configure``.
    """
    categorical_columns = ["cylinders", "model_year", "origin"]
    continuous_columns = ["displacement", "horsepower", "weight", "acceleration"]

    is_lightgbm = isinstance(regressor, LGBMRegressor)

    # Only the second categorical step differs between LightGBM and the rest.
    # Each column still receives its own freshly created transformer objects.
    make_encoder = label_encoder if is_lightgbm else label_binarizer
    categorical_steps = [
        ([column], [cat_domain(name), make_encoder(name)])
        for column in categorical_columns
    ]
    continuous_steps = [
        ([column], [cont_domain(name)])
        for column in continuous_columns
    ]

    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper([*categorical_steps, *continuous_steps])),
        ("regressor", regressor),
    ])

    if is_lightgbm:
        # The three categorical columns come first in the mapper output.
        lightgbm_fit_params = {"regressor__categorical_feature": [0, 1, 2]}
        pipeline.fit(auto_X, auto_y, **lightgbm_fit_params)
    elif isinstance(regressor, IsolationForest):
        # Isolation forests are fitted without a target.
        pipeline.fit(auto_X)
    else:
        pipeline.fit(auto_X, auto_y)

    # XGBoost is verified with explicit tolerances; everything else relies on
    # the defaults of ``verify``.
    verify_options = {}
    if isinstance(regressor, XGBRegressor):
        verify_options = {"precision": 1e-5, "zeroThreshold": 1e-5}
    pipeline.verify(auto_X.sample(n=3, random_state=13), **verify_options)

    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)

    if not isinstance(regressor, IsolationForest):
        results = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    else:
        scores = DataFrame(
            pipeline.decision_function(auto_X), columns=["decisionFunction"]
        )
        flags = DataFrame(pipeline.predict(auto_X), columns=["outlier"])
        # A prediction of -1 is written as "true", anything else as "false".
        flags["outlier"] = flags["outlier"].apply(
            lambda value: str(bool(value == -1)).lower()
        )
        results = pandas.concat((scores, flags), axis=1)
    store_csv(results, name)
]) # Train the model pipeline.fit(train_data, train_labels) # Pack verification data and verify pipeline.verify(test_data) # Save the pipeline + model sklearn2pmml.sklearn2pmml(sklearn2pmml.make_pmml_pipeline( pipeline, active_fields=FEATURE_COLS, target_fields=TARGET_COL), 'model.pmml', with_repr=True, debug=True) # Measure the accuracy descision = pipeline.decision_function(test_data.sample(1)) jpmml_input_data = pd.read_csv("jpmml-test-input.csv", usecols=FEATURE_COLS) input_pred = pipeline.predict(jpmml_input_data.squeeze()) with numpy.printoptions(threshold=numpy.inf): print(input_pred) pred = pipeline.predict(test_data) print('Accuracy = {:.3f}'.format( sum(l == p for l, p in zip(test_labels, pred)) / len(test_labels))) print( 'Dominant label freq = {:.3f}'.format(1 - sum(test_labels) / len(test_labels))) print('ROC AUC = {:.3f}'.format(roc_auc_score(test_labels, pred))) print(len(train_labels), 'training data rows') print(len(pipeline[1].coef_[0]), 'model parameters')