def build_iris(classifier, name, **pmml_options):
    """Fit a classification pipeline on the iris dataset and store PMML + CSV artifacts.

    classifier -- a scikit-learn compatible classifier.
    name -- base name for the stored artifacts.
    pmml_options -- PMML conversion options forwarded to pipeline.configure().
    """
    cont_columns = [ "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width" ]
    cont_mappings = [([cont_column], ContinuousDomain()) for cont_column in cont_columns]
    mapper = DataFrameMapper(cont_mappings)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    # Hoisted: both verification branches sample the same rows
    iris_sample = iris_X.sample(n=3, random_state=13)
    if isinstance(classifier, XGBClassifier):
        # XGBoost predictions are approximate; relax the verification tolerances
        pipeline.verify(iris_sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(iris_sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=[ "probability(setosa)", "probability(versicolor)", "probability(virginica)" ])
    store_csv(pandas.concat((species, species_proba), axis=1), name)
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
    # Fits a regressor on the auto dataset with missing values (auto_na_X/auto_na_y),
    # verifies it, and stores the pickled pipeline plus a CSV of its predictions.
    #
    # regressor -- a scikit-learn compatible regressor.
    # name -- base name for the stored artifacts.
    # predict_transformer, apply_transformer -- optional post-processing transformers
    #   forwarded to the PMMLPipeline constructor.
    # pmml_options -- PMML conversion options forwarded to pipeline.configure().
    mapper = DataFrameMapper(
        # Categorical columns use -1 as the missing-value marker: impute, then binarize
        [([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
        [(["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()])] +
        # Continuous columns use None/NaN as the missing-value marker
        [(["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()])] +
        [(["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
        # Clamp values outside the declared range to the extremes before median imputation
        [(["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")])] +
        [(["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ], predict_transformer = predict_transformer, apply_transformer = apply_transformer)
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        tree = regressor.tree_
        # Collect the non-zero per-node impurities and export them as PMML node extensions
        node_impurity = {node_idx : tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
        pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
    pipeline.configure(**pmml_options)
    pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
    if isinstance(regressor, DecisionTreeRegressor):
        # Record the decision tree leaf node ids alongside the predictions
        Xt = pipeline_transform(pipeline, auto_na_X)
        mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
        mpg = pandas.concat((mpg, mpg_apply), axis = 1)
    store_csv(mpg, name)
def build_auto(regressor, name, **pmml_options):
    """Fit a feature-engineered regression pipeline on the auto dataset and store pickle + CSV artifacts."""
    # Combined (cylinders, origin) categories; any other pair falls back to "other"
    cyl_origin_lookup = { (8, 1) : "8/1", (6, 1) : "6/1", (4, 1) : "4/1", (6, 2) : "6/2", (4, 2) : "4/2", (6, 3) : "6/3", (4, 3) : "4/3" }
    feature_mappings = [
        (["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cyl_origin_lookup, default_value = "other"), LabelBinarizer()]),
        (["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
        (["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
        (["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
    ]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(feature_mappings)),
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)
    verification_sample = auto_X.sample(frac = 0.05, random_state = 13)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(verification_sample, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(verification_sample)
    store_pkl(pipeline, name)
    store_csv(DataFrame(pipeline.predict(auto_X), columns = ["mpg"]), name)
def build_auto(regressor, name, **pmml_options):
    """Fit a regression/outlier pipeline on the auto dataset, verify it, and store PMML + CSV artifacts."""
    cat_columns = ["cylinders", "model_year", "origin"]
    cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
    # LightGBM consumes label-encoded categoricals; everything else gets binarized ones
    if isinstance(regressor, LGBMRegressor):
        encode = label_encoder
    else:
        encode = label_binarizer
    cat_mappings = [([column], [cat_domain(name), encode(name)]) for column in cat_columns]
    cont_mappings = [([column], [cont_domain(name)]) for column in cont_columns]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
        ("regressor", regressor)
    ])
    if isinstance(regressor, LGBMRegressor):
        # The first three mapped columns are the categorical ones
        pipeline.fit(auto_X, auto_y, regressor__categorical_feature = [0, 1, 2])
    elif isinstance(regressor, IsolationForest):
        # Outlier detection is unsupervised - no target column
        pipeline.fit(auto_X)
    else:
        pipeline.fit(auto_X, auto_y)
    verification_sample = auto_X.sample(n = 3, random_state = 13)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(verification_sample, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(verification_sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    if isinstance(regressor, IsolationForest):
        decision_function = DataFrame(pipeline.decision_function(auto_X), columns = ["decisionFunction"])
        outlier = DataFrame(pipeline.predict(auto_X), columns = ["outlier"])
        # Map the -1/+1 predictions to "true"/"false" strings
        outlier["outlier"] = outlier["outlier"].apply(lambda x: str(bool(x == -1)).lower())
        store_csv(pandas.concat((decision_function, outlier), axis = 1), name)
    else:
        store_csv(DataFrame(pipeline.predict(auto_X), columns = ["mpg"]), name)
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
    """Fit a heavily feature-engineered regression pipeline on the auto dataset and store pickle + CSV artifacts.

    regressor -- a scikit-learn compatible regressor.
    name -- base name for the stored artifacts.
    fit_params -- optional keyword arguments forwarded to pipeline.fit().
    predict_params -- optional keyword arguments forwarded to pipeline.verify()/predict().
    pmml_options -- PMML conversion options forwarded to pipeline.configure().
    """
    # Avoid the mutable-default-argument pitfall; normalize None to empty dicts
    if fit_params is None:
        fit_params = {}
    if predict_params is None:
        predict_params = {}
    # Combined (cylinders, origin) categories; any other pair falls back to "other"
    cylinders_origin_mapping = { (8, 1) : "8/1", (6, 1) : "6/1", (4, 1) : "4/1", (6, 2) : "6/2", (4, 2) : "4/2", (4, 3) : "4/3" }
    mapper = DataFrameMapper([
        (["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
        (["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
        # Expand the two-digit model year to a date, then flag pre/post 1977 models
        (["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
        (["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
        (["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
        (["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y, **fit_params)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        # XGBoost predictions are approximate; relax the verification tolerances
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
    store_csv(mpg, name)
def build_audit(classifier, name, **pmml_options):
    # Fits a classifier on the audit dataset, verifies it, and stores PMML + CSV artifacts.
    #
    # classifier -- a scikit-learn compatible classifier.
    # name -- base name for the stored artifacts.
    # pmml_options -- PMML conversion options forwarded to pipeline.configure().
    if isinstance(classifier, LGBMClassifier):
        # LightGBM additionally treats "Age" as a categorical column
        cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
        cont_columns = ["Income", "Hours"]
    else:
        cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
        cont_columns = ["Age", "Income", "Hours"]
    if isinstance(classifier, LGBMClassifier):
        cat_mappings = [([cat_column], [cat_domain(name), label_encoder(name)]) for cat_column in cat_columns]
    else:
        cat_mappings = [([cat_column], [cat_domain(name), label_binarizer(name)]) for cat_column in cat_columns]
    cont_mappings = [([cont_column], cont_domain(name)) for cont_column in cont_columns]
    mapper = DataFrameMapper(cat_mappings + cont_mappings)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    if isinstance(classifier, LGBMClassifier):
        # NOTE(review): cat_columns lists seven columns here, but categorical_feature
        # covers indices 0-5 only (omits "Deductions") -- confirm this is intended
        pipeline.fit(audit_X, audit_y, classifier__categorical_feature = [0, 1, 2, 3, 4, 5])
    elif isinstance(classifier, XGBClassifier):
        if name == "XGBoostAuditNA":
            # NOTE(review): mutates the module-level audit_X DataFrame in place,
            # which affects any builders that run afterwards -- confirm intended
            audit_X["Age"] = audit_X["Age"].astype(float)
            pipeline.fit(audit_X, audit_y)
        else:
            pipeline.fit(audit_X, audit_y)
    else:
        pipeline.fit(audit_X, audit_y)
    if isinstance(classifier, XGBClassifier):
        # XGBoost predictions are approximate; relax the verification tolerances
        pipeline.verify(audit_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(audit_X.sample(n = 3, random_state = 13))
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
    store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    """Fit a TF-IDF text-classification pipeline on the sentiment dataset and store pickle + CSV artifacts.

    classifier -- a scikit-learn compatible classifier.
    name -- base name for the stored artifacts (".pkl"/".csv" suffixes are appended).
    with_proba -- whether to also store predicted class probabilities.
    pmml_options -- PMML conversion options forwarded to pipeline.configure().
    """
    pipeline = PMMLPipeline([
        ("tf-idf", TfidfVectorizer(
            analyzer="word", preprocessor=None, strip_accents=None,
            lowercase=True, token_pattern=None, tokenizer=Splitter(),
            stop_words="english", ngram_range=(1, 2), norm=None,
            # float32 term weights for RandomForestClassifier, float64 otherwise
            dtype=(numpy.float32 if isinstance(
                classifier, RandomForestClassifier) else numpy.float64))),
        ("selector", SelectKBest(f_classif, k=500)),
        ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    # Idiomatic truthiness check (was: with_proba == True)
    if with_proba:
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
def build_audit(mapper, classifier, name, **pmml_options):
    """Fit a caller-supplied mapper + classifier pipeline on the audit dataset and store pickle + CSV artifacts."""
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    # Predictions and class probabilities go into a single CSV, side by side
    predictions = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    probabilities = DataFrame(pipeline.predict_proba(audit_X), columns=["probability(0)", "probability(1)"])
    store_csv(pandas.concat((predictions, probabilities), axis=1), name)
def test_configure(self):
    """configure() should attach pmml_options_ to the estimator only when options are passed."""
    regressor = DecisionTreeRegressor()
    pipeline = PMMLPipeline([("regressor", regressor)])
    self.assertFalse(hasattr(regressor, "pmml_options_"))
    # An empty configuration call must not create the attribute
    pipeline.configure()
    self.assertFalse(hasattr(regressor, "pmml_options_"))
    # A non-empty configuration call records the options on the final estimator
    pipeline.configure(compact=True, flat=True)
    self.assertTrue(hasattr(regressor, "pmml_options_"))
    options = regressor.pmml_options_
    self.assertEqual(True, options["compact"])
    self.assertEqual(True, options["flat"])
def xgboost_auto():
    """Fit an XGBoost regressor on the auto dataset and export it to pmml/XGBoostAuto.pmml."""
    mapper = make_xgboost_dataframe_mapper(auto_X.dtypes, missing_value_aware=False)
    regressor = XGBRegressor(n_estimators=31, max_depth=3, random_state=13)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(compact=False)
    sklearn2pmml(pipeline, "pmml/XGBoostAuto.pmml", with_repr=True)
def xgboost_audit():
    """Fit an XGBoost classifier on the audit dataset and export it to pmml/XGBoostAudit.pmml."""
    mapper = make_xgboost_dataframe_mapper(audit_X.dtypes, missing_value_aware=False)
    classifier = XGBClassifier(n_estimators=71, max_depth=5, random_state=13)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(compact=True)
    sklearn2pmml(pipeline, "pmml/XGBoostAudit.pmml", with_repr=True)
def lightgbm_audit():
    """Fit a LightGBM classifier on the audit dataset and export it to pmml/LightGBMAudit.pmml."""
    # The mapper factory also reports which mapped columns are categorical
    mapper, categorical_feature = make_lightgbm_dataframe_mapper(audit_X.dtypes, missing_value_aware=False)
    classifier = LGBMClassifier(n_estimators=71, max_depth=7, random_state=13)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y, classifier__categorical_feature=categorical_feature)
    pipeline.configure(compact=True)
    sklearn2pmml(pipeline, "pmml/LightGBMAudit.pmml", with_repr=False)
def lightgbm_auto():
    """Fit a LightGBM regressor on the auto dataset and export it to pmml/LightGBMAuto.pmml."""
    # The mapper factory also reports which mapped columns are categorical
    mapper, categorical_feature = make_lightgbm_dataframe_mapper(auto_X.dtypes, missing_value_aware=False)
    regressor = LGBMRegressor(n_estimators=31, max_depth=5, random_state=13)
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y, regressor__categorical_feature=categorical_feature)
    pipeline.configure(compact=True)
    sklearn2pmml(pipeline, "pmml/LightGBMAuto.pmml", with_repr=False)
def sklearn_audit(classifier, name):
    """Fit a scikit-learn classifier on the audit dataset and export it to pmml/<name>.pmml."""
    cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
    cont_columns = ["Age", "Income", "Hours"]
    cat_mappings = [([column], [CategoricalDomain(), OneHotEncoder()]) for column in cat_columns]
    cont_mappings = [([column], ContinuousDomain()) for column in cont_columns]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(compact=False)
    sklearn2pmml(pipeline, "pmml/" + name + ".pmml", with_repr=False)
def sklearn_auto(regressor, name):
    """Fit a scikit-learn regressor on the auto dataset and export it to pmml/<name>.pmml."""
    cat_columns = ["cylinders", "model_year", "origin"]
    cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
    cat_mappings = [([column], [CategoricalDomain(), OneHotEncoder()]) for column in cat_columns]
    cont_mappings = [([column], ContinuousDomain()) for column in cont_columns]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(compact=False)
    sklearn2pmml(pipeline, "pmml/" + name + ".pmml", with_repr=False)
def build_sentiment(classifier, transformer, name, with_proba = True, **pmml_options):
    """Fit a text-classification pipeline on the sentiment dataset and store PMML + CSV artifacts."""
    steps = [
        ("transformer", transformer),
        # Densify the sparse term matrix for downstream steps
        ("densifier", DenseTransformer()),
        ("selector", SelectKBest(f_classif, k = 500)),
        ("classifier", classifier)
    ]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
    if with_proba:
        proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
        score = pandas.concat((score, proba), axis = 1)
    store_csv(score, name)
def build_auto_na(regressor, name):
    # Fits a regressor on the auto dataset with missing values and stores the
    # pickled pipeline plus a CSV of its predictions (legacy sklearn Imputer API).
    mapper = DataFrameMapper(
        # Categorical columns use -1 as the missing-value marker: impute, then binarize
        [([column], [
            CategoricalDomain(missing_values=-1),
            CategoricalImputer(missing_values=-1),
            PMMLLabelBinarizer()
        ]) for column in ["cylinders", "model_year"]] +
        # NOTE(review): unlike the sibling categorical columns, "origin" has no
        # CategoricalDomain decorator before imputation -- confirm this is intended
        [(["origin"], [CategoricalImputer(missing_values=-1), OneHotEncoder()])] +
        [(["acceleration"], [
            ContinuousDomain(missing_values=None),
            # Discretize into 8 bins, keeping integer bin indices as labels
            CutTransformer(bins=[5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels=False),
            CategoricalImputer(),
            LabelBinarizer()
        ])] +
        [(["displacement"], [
            ContinuousDomain(missing_values=None),
            Imputer(),  # legacy pre-SimpleImputer scikit-learn imputer
            CutTransformer(bins=[0, 100, 200, 300, 400, 500], labels=["XS", "S", "M", "L", "XL"]),
            LabelBinarizer()
        ])] +
        # Clamp values outside the declared range to the extremes before imputation
        [(["horsepower"], [
            ContinuousDomain(missing_values=None, outlier_treatment="as_extreme_values", low_value=50, high_value=225),
            Imputer()
        ])] +
        [(["weight"], [
            ContinuousDomain(missing_values=None, outlier_treatment="as_extreme_values", low_value=2000, high_value=5000),
            Imputer()
        ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        tree = regressor.tree_
        # Export the non-zero per-node impurities as PMML node extensions
        node_impurity = {
            node_idx: tree.impurity[node_idx]
            for node_idx in range(0, tree.node_count)
            if tree.impurity[node_idx] != 0.0
        }
        pipeline.configure(
            node_extensions={regressor.criterion: node_impurity})
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
def build_versicolor_direct(classifier, name, with_proba=True, **pmml_options):
    """Fit a petal-features-only classification pipeline on the versicolor dataset and store pickle + CSV artifacts.

    classifier -- a scikit-learn compatible classifier.
    name -- base name for the stored artifacts.
    with_proba -- whether to also store predicted class probabilities.
    pmml_options -- PMML conversion options forwarded to pipeline.configure().
    """
    # Pass the two petal columns through unchanged; drop everything else
    transformer = ColumnTransformer(
        [("all", "passthrough", ["Petal.Length", "Petal.Width"])],
        remainder="drop")
    pipeline = PMMLPipeline([("transformer", transformer), ("classifier", classifier)])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline.configure(**pmml_options)
    pipeline.verify(versicolor_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    # Idiomatic truthiness check (was: with_proba == True)
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(versicolor_X), columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name)
def build_sentiment(classifier, tokenizer, name, with_proba = True, **pmml_options):
    """Fit a TF-IDF + word-count feature-union pipeline on the sentiment dataset and store pickle + CSV artifacts.

    classifier -- a scikit-learn compatible classifier.
    tokenizer -- tokenizer callable forwarded to TfidfVectorizer.
    name -- base name for the stored artifacts.
    with_proba -- whether to also store predicted class probabilities.
    pmml_options -- PMML conversion options forwarded to pipeline.configure().
    """
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            # sublinear_tf only for LogisticRegressionCV; float32 weights only for RandomForestClassifier
            ("tf-idf", TfidfVectorizer(analyzer = "word", preprocessor = None, strip_accents = None, lowercase = True, tokenizer = tokenizer, stop_words = "english", ngram_range = (1, 2), norm = None, sublinear_tf = isinstance(classifier, LogisticRegressionCV), dtype = (numpy.float32 if isinstance(classifier, RandomForestClassifier) else numpy.float64))),
            ("count", WordCountTransformer())
        ])),
        ("selector", SelectKBest(f_classif, k = 1000)),
        ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
    # Idiomatic truthiness check (was: with_proba == True)
    if with_proba:
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis = 1)
    store_csv(score, name)
def build_audit_na(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
    # Fits a classifier on the audit dataset with missing values, verifies it,
    # and stores the pickled pipeline plus a CSV of its predictions.
    #
    # NOTE(review): fit_params/predict_params/predict_proba_params are mutable
    # default arguments; they are only unpacked (never mutated) here, but
    # None-defaults would be the safer idiom.
    #
    # Collapse detailed employment categories into PRIVATE/PUBLIC
    employment_mapping = {
        "CONSULTANT" : "PRIVATE",
        "PSFEDERAL" : "PUBLIC",
        "PSLOCAL" : "PUBLIC",
        "PSSTATE" : "PUBLIC",
        "SELFEMP" : "PRIVATE",
        "PRIVATE" : "PRIVATE"
    }
    # Numeric encoding of gender; "MISSING_VALUE" is presumably the uppercased
    # output of SimpleImputer(strategy = "constant") below -- verify
    gender_mapping = {
        "FEMALE" : 0.0,
        "MALE" : 1.0,
        "MISSING_VALUE" : 0.5
    }
    mapper = DataFrameMapper(
        # Age: re-encode missing as -999, then impute the constant 38
        [(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
        [(["Age"], MissingIndicator())] +
        [(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
        # Income: treat values outside [5000, 200000] as missing, then impute the median
        [(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
        # Employment: normalize case, collapse categories via the lookup, binarize
        [(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
        [([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
        [(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y, **fit_params)
    pipeline.configure(**pmml_options)
    if isinstance(classifier, XGBClassifier):
        # XGBoost predictions are approximate; relax the verification tolerances
        pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    if isinstance(classifier, DecisionTreeClassifier):
        # Record the decision tree leaf node ids alongside the predictions
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
    store_csv(adjusted, name)
def build_audit_na(classifier, name, with_proba = True, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
    # Fits a classifier on the audit dataset with missing values and stores the
    # pickled pipeline plus a CSV of its predictions (legacy sklearn Imputer API).
    #
    # Collapse detailed employment categories into PRIVATE/PUBLIC
    employment_mapping = {
        "CONSULTANT" : "PRIVATE",
        "PSFEDERAL" : "PUBLIC",
        "PSLOCAL" : "PUBLIC",
        "PSSTATE" : "PUBLIC",
        "SELFEMP" : "PRIVATE",
        "PRIVATE" : "PRIVATE"
    }
    # Numeric encoding of gender; unmapped values fall back to the None default below
    gender_mapping = {
        "FEMALE" : 0,
        "MALE" : 1
    }
    mapper = DataFrameMapper(
        # Age/Hours: re-encode missing as -999, then impute
        [(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999"), name = "flag_missing(Age, -999)"), Imputer(missing_values = -999)])] +
        [(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), Imputer(missing_values = -999)])] +
        # Income: treat values outside [5000, 200000] as missing, then impute
        [(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), Imputer()])] +
        # Employment: normalize case, collapse categories via the lookup, binarize
        [(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
        [([column], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
        [(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ], predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    if isinstance(classifier, DecisionTreeClassifier):
        # Record the decision tree leaf node ids alongside the predictions
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
    store_csv(adjusted, name + ".csv")
LabelBinarizer() ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()), (["Hours", "Income"], Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income")) ]) interaction_mapper = DataFrameMapper([ ("Gender", [CategoricalDomain(), LabelBinarizer()]), ("Marital", [CategoricalDomain(), LabelBinarizer()]) ]) classifier = XGBClassifier() pipeline = PMMLPipeline([ ("mapper", FeatureUnion([("scalar_mapper", scalar_mapper), ("interaction", Pipeline([("interaction_mapper", interaction_mapper), ("polynomial", PolynomialFeatures())]))])), ("classifier", classifier) ]) pipeline.fit(audit_X, audit_y) pipeline.configure(compact=True) pipeline.verify(audit_X.sample(100), zeroThreshold=1e-6, precision=1e-6) sklearn2pmml(pipeline, "pmml/XGBoostAudit.pmml") if "--deploy" in sys.argv: from openscoring import Openscoring os = Openscoring("http://localhost:8080/openscoring") os.deployFile("XGBoostAudit", "pmml/XGBoostAudit.pmml")