예제 #1
0
def build_auto(regressor, name, **pmml_options):
	"""Fit, verify, configure and export a pipeline wrapping ``regressor``.

	The pipeline is a ``DataFrameMapper`` over three categorical and four
	continuous columns followed by ``regressor``. It is fitted on the
	module-level ``auto_X`` / ``auto_y`` data, verified on a fixed 3-row
	sample, configured with ``pmml_options`` and handed to ``store_pmml``;
	its predictions on ``auto_X`` are then handed to ``store_csv``.

	Estimator-specific handling:
	  * ``LGBMRegressor``: categorical columns go through ``label_encoder``
	    instead of ``label_binarizer``, and the first three mapped columns are
	    passed to ``fit`` as ``regressor__categorical_feature``.
	  * ``IsolationForest``: fitted without a target; the stored CSV holds
	    ``decisionFunction`` and a lower-case boolean ``outlier`` column
	    rather than ``mpg``.
	  * ``XGBRegressor``: verified with ``precision`` and ``zeroThreshold``
	    both set to 1e-5.

	Args:
		regressor: Estimator placed at the end of the pipeline.
		name: Passed to the domain/encoder factories and to the store helpers.
		**pmml_options: Forwarded unchanged to ``pipeline.configure``.
	"""
	categorical = ["cylinders", "model_year", "origin"]
	continuous = ["displacement", "horsepower", "weight", "acceleration"]
	is_lgbm = isinstance(regressor, LGBMRegressor)

	# Only the categorical encoder differs between LightGBM and the rest.
	encode = label_encoder if is_lgbm else label_binarizer
	features = []
	for column in categorical:
		features.append(([column], [cat_domain(name), encode(name)]))
	for column in continuous:
		features.append(([column], [cont_domain(name)]))

	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(features)),
		("regressor", regressor)
	])

	# Assemble the fit call: IsolationForest takes no target, LightGBM takes
	# an extra fit parameter naming the categorical feature positions.
	fit_args = [auto_X]
	fit_kwargs = {}
	if is_lgbm:
		fit_args.append(auto_y)
		fit_kwargs["regressor__categorical_feature"] = [0, 1, 2]
	elif not isinstance(regressor, IsolationForest):
		fit_args.append(auto_y)
	pipeline.fit(*fit_args, **fit_kwargs)

	verify_options = {}
	if isinstance(regressor, XGBRegressor):
		verify_options.update(precision = 1e-5, zeroThreshold = 1e-5)
	pipeline.verify(auto_X.sample(n = 3, random_state = 13), **verify_options)

	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)

	if not isinstance(regressor, IsolationForest):
		predictions = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
		store_csv(predictions, name)
		return

	scores = DataFrame(pipeline.decision_function(auto_X), columns = ["decisionFunction"])
	labels = DataFrame(pipeline.predict(auto_X), columns = ["outlier"])
	# A prediction of -1 is written as "true", anything else as "false".
	labels["outlier"] = labels["outlier"].apply(lambda value: "true" if value == -1 else "false")
	store_csv(pandas.concat((scores, labels), axis = 1), name)
])
# Fit the pipeline on the training split
pipeline.fit(train_data, train_labels)

# Run the pipeline's verification step on the test data
pipeline.verify(test_data)

# Wrap the fitted pipeline for PMML export, then write it to 'model.pmml'
pmml_pipeline = sklearn2pmml.make_pmml_pipeline(
    pipeline, active_fields=FEATURE_COLS, target_fields=TARGET_COL)
sklearn2pmml.sklearn2pmml(
    pmml_pipeline, 'model.pmml', with_repr=True, debug=True)

# Evaluate the fitted pipeline
# NOTE(review): `descision` is not used again in the visible code; the
# (misspelled) name is kept in case later code refers to it.
descision = pipeline.decision_function(test_data.sample(1))

# Predict on the external JPMML input file and print every prediction
# (threshold=inf disables numpy's summarisation of long arrays).
jpmml_input_data = pd.read_csv("jpmml-test-input.csv", usecols=FEATURE_COLS)
jpmml_features = jpmml_input_data.squeeze()
input_pred = pipeline.predict(jpmml_features)
with numpy.printoptions(threshold=numpy.inf):
    print(input_pred)

pred = pipeline.predict(test_data)
n_test_rows = len(test_labels)

# Fraction of test rows whose predicted label equals the true label
n_correct = sum(label == predicted
                for label, predicted in zip(test_labels, pred))
print('Accuracy = {:.3f}'.format(n_correct / n_test_rows))

# One minus the mean of the test labels
label_mean = sum(test_labels) / n_test_rows
print('Dominant label freq = {:.3f}'.format(1 - label_mean))

# NOTE(review): the AUC is computed from hard predictions (`pred`), not from
# decision scores - confirm this is intended.
auc = roc_auc_score(test_labels, pred)
print('ROC AUC = {:.3f}'.format(auc))

print(len(train_labels), 'training data rows')
model_coefficients = pipeline[1].coef_[0]
print(len(model_coefficients), 'model parameters')