def test_transform_float(self):
    """Known float labels binarize to one-hot rows; an unseen NaN yields an all-zero row."""
    labels = [1.0, float("NaN"), 2.0, 3.0]
    encoder = PMMLLabelBinarizer()
    encoder.fit(labels)
    encoded = encoder.transform([1.0, 3.0, float("NaN"), 2.0])
    expected = [[1, 0, 0], [0, 0, 1], [0, 0, 0], [0, 1, 0]]
    self.assertEqual(expected, encoded.tolist())
def test_transform_float(self):
    """Dense and sparse float binarization agree; NaN maps to an all-zero indicator row."""
    X = [1.0, float("NaN"), 2.0, 3.0]
    queries = [1.0, 3.0, float("NaN"), 2.0]
    dense = PMMLLabelBinarizer()
    dense.fit(X)
    Xt_dense = dense.transform(queries)
    self.assertIsInstance(Xt_dense, numpy.ndarray)
    expected = [[1, 0, 0], [0, 0, 1], [0, 0, 0], [0, 1, 0]]
    self.assertEqual(expected, Xt_dense.tolist())
    sparse = PMMLLabelBinarizer(sparse_output = True)
    sparse.fit(X)
    Xt_sparse = sparse.transform(queries)
    self.assertIsInstance(Xt_sparse, scipy.sparse.csr_matrix)
    # The sparse result must densify to exactly the dense result
    self.assertEqual(Xt_dense.tolist(), Xt_sparse.toarray().tolist())
def make_xgboost_column_transformer(dtypes, missing_value_aware=True):
    """Construct a ColumnTransformer for feeding complex data into an XGBModel.

    Parameters
    ----------
    dtypes : dict-like of column -> dtype
        Mapping from column name to column dtype (e.g. ``DataFrame.dtypes``);
        must support ``.items()``.

    missing_value_aware : boolean
        If true, use missing value aware transformers.

    Returns
    -------
    ColumnTransformer
    """
    transformers = list()
    # Note: the docstring previously claimed "iterable of tuples", but the
    # implementation has always required a dict-like object (.items() call below)
    for column, dtype in dtypes.items():
        if _is_categorical(dtype):
            # Missing value aware binarization, or a dense ordinal + one-hot fallback
            if missing_value_aware:
                transformer = PMMLLabelBinarizer(sparse_output=True)
            else:
                transformer = Pipeline([("ordinal_encoder", OrdinalEncoder()), ("one_hot_encoder", OneHotEncoder())])
            transformers.append((column, transformer, [column]))
        else:
            # Continuous columns are fed to the model unchanged
            transformers.append((column, "passthrough", [column]))
    return ColumnTransformer(transformers, remainder="drop")
def make_xgboost_dataframe_mapper(dtypes, missing_value_aware=True):
    """Construct a DataFrameMapper for feeding complex data into an XGBModel.

    Parameters
    ----------
    dtypes : dict-like of column -> dtype
        Mapping from column name to column dtype (e.g. ``DataFrame.dtypes``);
        must support ``.items()``.

    missing_value_aware : boolean
        If true, use missing value aware transformers.

    Returns
    -------
    DataFrameMapper
    """
    features = list()
    # Note: the docstring previously claimed "iterable of tuples", but the
    # implementation has always required a dict-like object (.items() call below)
    for column, dtype in dtypes.items():
        if _is_categorical(dtype):
            binarizer = PMMLLabelBinarizer(sparse_output=True) if missing_value_aware else LabelBinarizer(sparse_output=True)
            features.append(([column], binarizer))
        else:
            # None means the column passes through unchanged
            features.append(([column], None))
    return DataFrameMapper(features)
def build_auto_na(regressor, name, predict_transformer=None, apply_transformer=None, **pmml_options):
    """Fit a PMML regression pipeline on the auto-NA dataset and store the fitted model (.pkl) and its predictions (.csv).

    Parameters
    ----------
    regressor : a scikit-learn regressor
        The final estimator of the pipeline.
    name : str
        Base name for the stored .pkl and .csv artifacts.
    predict_transformer, apply_transformer :
        Optional post-processing transformers passed through to PMMLPipeline.
    pmml_options :
        Extra conversion options forwarded to ``pipeline.configure``.
    """
    mapper = DataFrameMapper(
        # Integer-coded categoricals: -1 marks a missing value
        [([column], [
            CategoricalDomain(missing_values=-1),
            CategoricalImputer(missing_values=-1),
            PMMLLabelBinarizer()
        ]) for column in ["cylinders", "model_year"]] +
        [(["origin"], [CategoricalImputer(missing_values=-1), OneHotEncoder()])] +
        # Continuous column discretized into fixed bins, then binarized
        [(["acceleration"], [
            ContinuousDomain(missing_values=None),
            CutTransformer(bins=[5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels=False),
            CategoricalImputer(),
            LabelBinarizer()
        ])] +
        [(["displacement"], [
            ContinuousDomain(missing_values=None),
            Imputer(),
            CutTransformer(bins=[0, 100, 200, 300, 400, 500], labels=["XS", "S", "M", "L", "XL"]),
            LabelBinarizer()
        ])] +
        # Continuous columns with outliers clamped to the [low_value, high_value] range
        [(["horsepower"], [
            ContinuousDomain(missing_values=None, outlier_treatment="as_extreme_values", low_value=50, high_value=225),
            Imputer()
        ])] +
        [(["weight"], [
            ContinuousDomain(missing_values=None, outlier_treatment="as_extreme_values", low_value=2000, high_value=5000),
            Imputer()
        ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)], predict_transformer=predict_transformer, apply_transformer=apply_transformer)
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        # Record the impurity of every non-pure tree node as a PMML node extension
        tree = regressor.tree_
        node_impurity = {
            node_idx: tree.impurity[node_idx]
            for node_idx in range(0, tree.node_count)
            if tree.impurity[node_idx] != 0.0
        }
        pmml_options["node_extensions"] = {regressor.criterion: node_impurity}
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    if isinstance(regressor, DecisionTreeRegressor):
        # Also record the id of the leaf node each row lands in
        Xt = pipeline_transform(pipeline, auto_na_X)
        mpg_apply = DataFrame(regressor.apply(Xt), columns=["nodeId"])
        mpg = pandas.concat((mpg, mpg_apply), axis=1)
    store_csv(mpg, name + ".csv")
def test_transform_string(self):
    """Known string labels binarize to one-hot rows; None (missing) binarizes to zeros."""
    labels = ["A", None, "B", "C"]
    encoder = PMMLLabelBinarizer()
    encoder.fit(labels)
    encoded = encoder.transform(["A", "C", None, "B"])
    self.assertEqual([[1, 0, 0], [0, 0, 1], [0, 0, 0], [0, 1, 0]], encoded.tolist())
    # A lone missing value produces a single all-zero row
    self.assertEqual([[0, 0, 0]], encoder.transform([None]).tolist())
    # Class order in output columns follows the fitted class order
    self.assertEqual([[1, 0, 0], [0, 1, 0], [0, 0, 1]], encoder.transform(["A", "B", "C"]).tolist())
def test_fit_string(self):
    """Fitting learns the distinct non-missing classes from list, ndarray and Series inputs alike."""
    data = ["A", None, "A", "B", None, "C", "C", "B"]
    expected_classes = ["A", "B", "C"]
    encoder = PMMLLabelBinarizer()
    self.assertFalse(hasattr(encoder, "classes_"))
    for values in (data, numpy.array(data), Series(numpy.array(data))):
        encoder.fit(values)
        self.assertEqual(expected_classes, encoder.classes_.tolist())
def build_audit_na(classifier, name, with_proba=True):
    """Fit a PMML classification pipeline on the audit-NA dataset and store the fitted model (.pkl) and its predictions (.csv)."""
    employment_mapping = {
        "Consultant": "Private",
        "PSFederal": "Public",
        "PSLocal": "Public",
        "PSState": "Public",
        "SelfEmp": "Private",
        "Private": "Private"
    }
    gender_mapping = {"Female": 0, "Male": 1}
    # Continuous columns: declare domain, then impute missing values
    continuous_features = [([column], [ContinuousDomain(missing_values=None), Imputer()]) for column in ["Age", "Income", "Hours"]]
    # Employment: collapse the raw categories into Private/Public (unknowns -> "Other"), then binarize
    employment_feature = [("Employment", [CategoricalDomain(missing_values=None), CategoricalImputer(), LookupTransformer(employment_mapping, "Other"), PMMLLabelBinarizer()])]
    categorical_features = [([column], [CategoricalDomain(missing_values=None), CategoricalImputer(), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]]
    # Gender: re-code as 0/1 with no default for unmapped values
    gender_feature = [("Gender", [CategoricalDomain(missing_values=None), CategoricalImputer(), LookupTransformer(gender_mapping, None)])]
    mapper = DataFrameMapper(continuous_features + employment_feature + categorical_features + gender_feature)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_auto_na_hist(regressor, name):
    """Fit a histogram-style regressor pipeline on the auto-NA dataset, verify it on a sample, and store the model and predictions."""
    continuous_columns = ["displacement", "horsepower", "weight", "acceleration"]
    categorical_columns = ["cylinders", "model_year", "origin"]
    features = [([column], ContinuousDomain()) for column in continuous_columns]
    features += [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in categorical_columns]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(features)),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_na_X, auto_na_y)
    # Embed verification data: a fixed 5% sample for reproducibility
    pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
    store_csv(mpg, name)
def build_auto_na(regressor, name):
    """Fit a regression pipeline on the auto-NA dataset and store the fitted model (.pkl) and its predictions (.csv)."""
    # Continuous columns: declare domain, then impute missing values
    continuous = [([column], [ContinuousDomain(missing_values=None), Imputer()]) for column in ["acceleration", "displacement", "horsepower", "weight"]]
    # Integer-coded categoricals: -1 marks a missing value
    categorical = [([column], [CategoricalDomain(missing_values=-1), CategoricalImputer(missing_values=-1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(continuous + categorical)),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_na_X, auto_na_y)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
def build_audit_na(classifier, name, with_proba = True):
    """Fit a classification pipeline on the audit-NA dataset and store the fitted model (.pkl) and its predictions (.csv)."""
    continuous = [([column], [ContinuousDomain(missing_values = None), Imputer()]) for column in ["Age", "Income", "Hours"]]
    categorical = [([column], [CategoricalDomain(missing_values = None), CategoricalImputer(), PMMLLabelBinarizer()]) for column in ["Employment", "Education", "Marital", "Occupation", "Gender"]]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(continuous + categorical)),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name + ".csv")
def build_audit_na_hist(classifier, name):
    """Fit a histogram-style classifier pipeline on the audit-NA dataset, verify it on a sample, and store the model and predictions."""
    features = [([column], ContinuousDomain()) for column in ["Age", "Hours", "Income"]]
    features += [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in ["Employment", "Education", "Marital", "Occupation", "Gender"]]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(features)),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_na_X, audit_na_y)
    # Embed verification data: a fixed 5% sample for reproducibility
    pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
    adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name)
def label_binarizer(name):
    """Select a binarizer: missing-value aware for "NA"-suffixed dataset names, standard otherwise."""
    if name.endswith("NA"):
        return PMMLLabelBinarizer()
    return LabelBinarizer()
def build_audit_na(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
    """Fit a PMML classification pipeline on the audit-NA dataset, verify it on a sample, and store the fitted model and its predictions.

    Parameters
    ----------
    classifier : a scikit-learn classifier
        The final estimator of the pipeline.
    name : str
        Base name for the stored artifacts.
    with_proba : boolean
        If true, also store class probabilities.
    fit_params, predict_params, predict_proba_params : dict, optional
        Extra keyword arguments for fit/predict/predict_proba (default: none).
    predict_transformer, predict_proba_transformer, apply_transformer :
        Optional post-processing transformers passed through to PMMLPipeline.
    pmml_options :
        Extra conversion options forwarded to ``pipeline.configure``.
    """
    # Fix: the params dicts previously used mutable default arguments ({});
    # normalize the None sentinels here instead
    fit_params = {} if fit_params is None else fit_params
    predict_params = {} if predict_params is None else predict_params
    predict_proba_params = {} if predict_proba_params is None else predict_proba_params
    employment_mapping = {
        "CONSULTANT" : "PRIVATE",
        "PSFEDERAL" : "PUBLIC",
        "PSLOCAL" : "PUBLIC",
        "PSSTATE" : "PUBLIC",
        "SELFEMP" : "PRIVATE",
        "PRIVATE" : "PRIVATE"
    }
    # Gender is re-coded numerically; 0.5 stands for a missing value
    gender_mapping = {
        "FEMALE" : 0.0,
        "MALE" : 1.0,
        "MISSING_VALUE" : 0.5
    }
    mapper = DataFrameMapper(
        # Age: flag missing as sentinel -999, then impute the constant 38
        [(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
        [(["Age"], MissingIndicator())] +
        # Hours: same sentinel trick, imputer emits its own missing indicator
        [(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
        # Income: out-of-range values become missing, then median-imputed
        [(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
        # Employment: normalize case, collapse categories (unknowns -> "OTHER"), binarize
        [(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
        [([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
        [(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y, **fit_params)
    pipeline.configure(**pmml_options)
    # XGBoost predictions are less numerically reproducible; relax verification tolerances
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    if isinstance(classifier, DecisionTreeClassifier):
        # Also record the id of the leaf node each row lands in
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
    store_csv(adjusted, name)
max_depth=3, random_state=13, categorical_feature=lightgbm_categorical_feature)) ]) xgboost_mapper = make_xgboost_column_transformer(dtypes, missing_value_aware=True) xgboost_pipeline = Pipeline([("mapper", xgboost_mapper), ("classifier", XGBClassifier(n_estimators=31, learning_rate=0.1, max_depth=3, random_state=13))]) sklearn_mapper = ColumnTransformer( [(str(cat_index), PMMLLabelBinarizer(sparse_output=False), [cat_index]) for cat_index in range(0, len(cat_columns))] + [(str(cont_index), "passthrough", [cont_index]) for cont_index in range(len(cat_columns), len(cat_columns + cont_columns)) ], remainder="drop") sklearn_pipeline = Pipeline([("mapper", sklearn_mapper), ("classifier", HistGradientBoostingClassifier(max_iter=31, max_depth=3, random_state=13)) ]) final_estimator = LogisticRegression(multi_class="ovr", random_state=13)  # NOTE(review): fragment of a larger definition whose start is outside this view; left unmodified
def build_audit_na(classifier, name, with_proba=True, predict_proba_transformer=None, apply_transformer=None, **pmml_options):
    """Fit a PMML classification pipeline on the audit-NA dataset and store the fitted model (.pkl) and its predictions (.csv).

    Parameters
    ----------
    classifier : a scikit-learn classifier
        The final estimator of the pipeline.
    name : str
        Base name for the stored .pkl and .csv artifacts.
    with_proba : boolean
        If true, also store class probabilities.
    predict_proba_transformer, apply_transformer :
        Optional post-processing transformers passed through to PMMLPipeline.
    pmml_options :
        Extra conversion options forwarded to ``pipeline.configure``.
    """
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0, "MALE": 1}
    mapper = DataFrameMapper([(["Age"], [
        ContinuousDomain(missing_values=None, with_data=False),
        # Re-encode missing Age values as the sentinel -999, then impute them
        Alias(ExpressionTransformer(
            "numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"),
            name="flag_missing(Age, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Hours"], [
        ContinuousDomain(missing_values=None, with_data=False),
        # Same sentinel trick for Hours, with the condition inverted
        Alias(ExpressionTransformer(
            "numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"),
            name="flag_missing(Hours, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Income"], [
        # Out-of-range Income values become missing, then are imputed
        ContinuousDomain(missing_values=None, outlier_treatment="as_missing_values", low_value=5000, high_value=200000, with_data=False),
        Imputer()
    ])] + [(["Employment"], [
        # Normalize case, collapse categories (unknowns -> "OTHER"), binarize
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(employment_mapping, "OTHER"),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ])] + [([column], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(missing_values=None),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ]) for column in ["Education", "Marital", "Occupation"]] + [(["Gender"], [
        # Gender is re-coded as 0/1 with no default for unmapped values
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(gender_mapping, None)
    ])])
    pipeline = PMMLPipeline(
        [("mapper", mapper), ("classifier", classifier)],
        predict_proba_transformer=predict_proba_transformer,
        apply_transformer=apply_transformer)
    pipeline.fit(audit_na_X, audit_na_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        # Also record the id of the leaf node each row lands in
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")
def test_transform_string(self):
    """Dense and sparse string binarization agree; None maps to an all-zero row."""
    X = ["A", None, "B", "C"]
    queries = ["A", "C", None, "B"]
    dense = PMMLLabelBinarizer()
    dense.fit(X)
    Xt_dense = dense.transform(queries)
    self.assertIsInstance(Xt_dense, numpy.ndarray)
    self.assertEqual([[1, 0, 0], [0, 0, 1], [0, 0, 0], [0, 1, 0]], Xt_dense.tolist())
    # A lone missing value produces a single all-zero row
    self.assertEqual([[0, 0, 0]], dense.transform([None]).tolist())
    self.assertEqual([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dense.transform(["A", "B", "C"]).tolist())
    sparse = PMMLLabelBinarizer(sparse_output=True)
    sparse.fit(X)
    Xt_sparse = sparse.transform(queries)
    self.assertIsInstance(Xt_sparse, scipy.sparse.csr_matrix)
    # The sparse result must densify to exactly the dense result
    self.assertEqual(Xt_dense.tolist(), Xt_sparse.toarray().tolist())
def test_fit_float(self):
    """Fitting skips NaN values and learns the distinct float classes."""
    data = [1.0, float("NaN"), 1.0, 2.0, float("NaN"), 3.0, 3.0, 2.0]
    encoder = PMMLLabelBinarizer()
    encoder.fit(data)
    self.assertEqual([1.0, 2.0, 3.0], encoder.classes_.tolist())