def test_transform_float(self):
    """Transforming floats maps each label to its index (2-D output) and NaN to the missing sentinel."""
    data = [1.0, float("NaN"), 2.0, 3.0]
    le = PMMLLabelEncoder(missing_values=-999)
    le.fit(data)
    encoded = le.transform([1.0, 3.0, float("NaN"), 2.0])
    self.assertEqual([[0], [2], [-999], [1]], encoded.tolist())
def test_transform_float(self):
    """Transforming floats yields label indices; NaN becomes the configured missing_value."""
    labels = [1.0, float("NaN"), 2.0, 3.0]
    enc = PMMLLabelEncoder(missing_value=-999)
    enc.fit(labels)
    transformed = enc.transform([1.0, 3.0, float("NaN"), 2.0])
    self.assertEqual([0, 2, -999, 1], transformed.tolist())
def test_fit_float(self):
    """Fitting learns the sorted unique float labels; NaN does not appear in classes_."""
    data = [1.0, float("NaN"), 1.0, 2.0, float("NaN"), 3.0, 3.0, 2.0]
    expected_labels = [1.0, 2.0, 3.0]
    enc = PMMLLabelEncoder(missing_value=-999)
    # The constructor argument is stored verbatim on the estimator.
    self.assertEqual(-999, enc.missing_value)
    enc.fit(data)
    self.assertEqual(expected_labels, enc.classes_.tolist())
def make_lightgbm_dataframe_mapper(dtypes, missing_value_aware=True):
    """Construct a DataFrameMapper for feeding complex data into a LGBMModel.

    Parameters
    ----------
    dtypes: dict-like mapping of column name to dtype
        Must support ``.items()`` — e.g. the ``DataFrame.dtypes`` Series.
        (The previous docstring said "iterable of tuples", which the code
        never accepted.)
    missing_value_aware: boolean
        If true, use missing value aware transformers.

    Returns
    -------
    Tuple (DataFrameMapper, list of categorical column indices)
    """
    features = list()
    categorical_features = list()
    # enumerate() replaces the former hand-maintained counter; the positional
    # index is what LGBMModel uses to identify categorical columns.
    for i, (column, dtype) in enumerate(dtypes.items()):
        if _is_categorical(dtype):
            # Missing-value-aware encoding maps missing cells to -1.
            transformer = PMMLLabelEncoder(missing_values=-1) if missing_value_aware else LabelEncoder()
            features.append(([column], transformer))
            categorical_features.append(i)
        else:
            # Continuous columns pass through untransformed.
            features.append(([column], None))
    return (DataFrameMapper(features), categorical_features)
def make_lightgbm_column_transformer(dtypes, missing_value_aware=True):
    """Construct a ColumnTransformer for feeding complex data into a LGBMModel.

    Parameters
    ----------
    dtypes: dict-like mapping of column name to dtype
        Must support ``.items()`` — e.g. the ``DataFrame.dtypes`` Series.
        (The previous docstring said "iterable of tuples", which the code
        never accepted.)
    missing_value_aware: boolean
        If true, use missing value aware transformers.

    Returns
    -------
    Tuple (ColumnTransformer, list of categorical column indices)
    """
    transformers = list()
    categorical_features = list()
    # enumerate() replaces the former hand-maintained counter; the positional
    # index is what LGBMModel uses to identify categorical columns.
    for i, (column, dtype) in enumerate(dtypes.items()):
        if _is_categorical(dtype):
            # Missing-value-aware encoding maps missing cells to -1.
            encoder = PMMLLabelEncoder(missing_values=-1) if missing_value_aware else OrdinalEncoder()
            transformers.append((column, encoder, [column]))
            categorical_features.append(i)
        else:
            # Continuous columns pass through untransformed.
            transformers.append((column, "passthrough", [column]))
    return (ColumnTransformer(transformers, remainder="drop"), categorical_features)
def test_transform_string(self):
    """Transforming strings yields label indices; None passes through unchanged, for list, array and Series inputs."""
    labels = ["A", None, "B", "C"]
    enc = PMMLLabelEncoder()
    enc.fit(labels)
    # Plain list input: known labels map to indices, None stays None.
    self.assertEqual([0, 2, None, 1], enc.transform(["A", "C", None, "B"]).tolist())
    # Numpy array input containing only the missing marker.
    self.assertEqual([None], enc.transform(numpy.array([None])).tolist())
    # Pandas Series input.
    self.assertEqual([0, 1, 2], enc.transform(Series(numpy.array(["A", "B", "C"]))).tolist())
def test_transform_string(self):
    """Transforming strings yields 2-D label indices; None passes through, for list, array and Series inputs."""
    data = ["A", None, "B", "C"]
    le = PMMLLabelEncoder()
    le.fit(data)
    # Plain list input: each label maps to a single-element row.
    self.assertEqual([[0], [2], [None], [1]], le.transform(["A", "C", None, "B"]).tolist())
    # Numpy array input containing only the missing marker.
    self.assertEqual([[None]], le.transform(numpy.array([None])).tolist())
    # Pandas Series input.
    self.assertEqual([[0], [1], [2]], le.transform(Series(numpy.array(["A", "B", "C"]))).tolist())
def test_fit_string(self):
    """Fitting learns sorted unique string labels (None excluded) for list, array and Series inputs alike."""
    data = ["A", None, "A", "B", None, "C", "C", "B"]
    expected_labels = ["A", "B", "C"]
    enc = PMMLLabelEncoder()
    # Before the first fit, no classes_ attribute exists.
    self.assertFalse(hasattr(enc, "classes_"))
    # The same classes are learned regardless of the input container type.
    for labels in (data, numpy.array(data), Series(numpy.array(data))):
        enc.fit(labels)
        self.assertEqual(expected_labels, enc.classes_.tolist())
def build_auto_na_hist(regressor, name):
    """Fit, verify and persist a regression pipeline over the auto-NA dataset."""
    continuous_columns = ["displacement", "horsepower", "weight", "acceleration"]
    categorical_columns = ["cylinders", "model_year", "origin"]
    # Continuous columns get a plain domain; categorical ones are additionally
    # label-encoded with the missing-value-aware encoder.
    features = [([column], ContinuousDomain()) for column in continuous_columns]
    features += [([column], [CategoricalDomain(), PMMLLabelEncoder()]) for column in categorical_columns]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(features)),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_na_X, auto_na_y)
    # Embed verification data sampled from the training frame.
    pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
    store_csv(mpg, name)
def build_audit_na_hist(classifier, name):
    """Fit, verify and persist a classification pipeline over the audit-NA dataset."""
    continuous_columns = ["Age", "Hours", "Income"]
    categorical_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
    # Continuous columns get a plain domain; categorical ones are additionally
    # label-encoded with the missing-value-aware encoder.
    features = [([column], ContinuousDomain()) for column in continuous_columns]
    features += [([column], [CategoricalDomain(), PMMLLabelEncoder()]) for column in categorical_columns]
    pipeline = PMMLPipeline([
        ("pipeline", Pipeline([
            ("mapper", DataFrameMapper(features)),
            ("classifier", classifier)
        ]))
    ])
    pipeline.fit(audit_na_X, audit_na_y)
    # Embed verification data sampled from the training frame.
    pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13))
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
    adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
    adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name)
def label_encoder(name):
    """Return the missing-value-aware encoder for names ending in "NA", a plain LabelEncoder otherwise."""
    if name.endswith("NA"):
        return PMMLLabelEncoder()
    return LabelEncoder()
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

# Toggle between a binary and a multiclass target derived from "drv".
binary = False

data = pd.read_csv("test/support/mpg.csv")
if binary:
    # Collapse the "r" class into "4" to obtain a two-class target.
    data["drv"] = data["drv"].replace("r", "4")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), PMMLLabelEncoder()]) for f in categorical_features] +
    [(f, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter())]) for f in text_features])

pipeline = PMMLPipeline([
    ("mapper", mapper),
    ("model", LGBMClassifier(n_estimators=1000))
])
# The feature at index 3 is flagged as categorical for LightGBM.
pipeline.fit(data, data["drv"], model__categorical_feature=[3])

suffix = "binary" if binary else "multiclass"
sklearn2pmml(pipeline, "test/support/python/lightgbm_" + suffix + ".pmml")
print(pipeline.predict(data[:10]))
from sklearn.feature_extraction.text import CountVectorizer
from lightgbm import LGBMRegressor
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), PMMLLabelEncoder()]) for f in categorical_features] +
    # Text columns are tokenized and capped at the 5 most frequent terms.
    [(f, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter(), max_features=5)]) for f in text_features]
)

pipeline = PMMLPipeline([
    ("mapper", mapper),
    ("model", LGBMRegressor(n_estimators=1000))
])
# The features at indices 3 and 4 are flagged as categorical for LightGBM.
pipeline.fit(data, data["hwy"], model__categorical_feature=[3, 4])

sklearn2pmml(pipeline, "test/support/python/lightgbm_regression.pmml")
print(pipeline.predict(data[:10]))