Exemplo n.º 1
0
 def test_transform_float(self):
     """Transforming floats yields 2-D label indices; NaN maps to the sentinel."""
     data = [1.0, float("NaN"), 2.0, 3.0]
     encoder = PMMLLabelEncoder(missing_values=-999)
     encoder.fit(data)
     result = encoder.transform([1.0, 3.0, float("NaN"), 2.0]).tolist()
     self.assertEqual([[0], [2], [-999], [1]], result)
Exemplo n.º 2
0
 def test_transform_float(self):
     """Transforming floats yields 1-D label indices; NaN maps to the sentinel."""
     data = [1.0, float("NaN"), 2.0, 3.0]
     encoder = PMMLLabelEncoder(missing_value=-999)
     encoder.fit(data)
     result = encoder.transform([1.0, 3.0, float("NaN"), 2.0]).tolist()
     self.assertEqual([0, 2, -999, 1], result)
Exemplo n.º 3
0
 def test_fit_float(self):
     """Fitting floats records the distinct non-NaN values as classes_."""
     observations = [1.0, float("NaN"), 1.0, 2.0, float("NaN"), 3.0, 3.0, 2.0]
     encoder = PMMLLabelEncoder(missing_value=-999)
     # The constructor argument is stored verbatim before any fitting happens.
     self.assertEqual(-999, encoder.missing_value)
     encoder.fit(observations)
     self.assertEqual([1.0, 2.0, 3.0], encoder.classes_.tolist())
Exemplo n.º 4
0
def make_lightgbm_dataframe_mapper(dtypes, missing_value_aware=True):
    """Construct a DataFrameMapper for feeding complex data into a LGBMModel.

    Parameters
    ----------
    dtypes: iterable of tuples (column, dtype)

    missing_value_aware: boolean
        If true, use missing value aware transformers.

    Returns
    -------
    Tuple (DataFrameMapper, list of categorical column indices)
    """
    features = list()
    categorical_features = list()
    # enumerate() replaces the original manual "i = 0 / i += 1" counter;
    # i is the positional column index LightGBM expects for categoricals.
    for i, (column, dtype) in enumerate(dtypes.items()):
        if _is_categorical(dtype):
            # PMMLLabelEncoder encodes missing values as -1, which LightGBM
            # treats as its missing-category marker.
            encoder = (PMMLLabelEncoder(missing_values=-1)
                       if missing_value_aware else LabelEncoder())
            features.append(([column], encoder))
            categorical_features.append(i)
        else:
            # Non-categorical columns pass through the mapper untransformed.
            features.append(([column], None))
    return (DataFrameMapper(features), categorical_features)
Exemplo n.º 5
0
def make_lightgbm_column_transformer(dtypes, missing_value_aware=True):
    """Construct a ColumnTransformer for feeding complex data into a LGBMModel.

    Parameters
    ----------
    dtypes: iterable of tuples (column, dtype)

    missing_value_aware: boolean
        If true, use missing value aware transformers.

    Returns
    -------
    Tuple (ColumnTransformer, list of categorical column indices)
    """
    transformers = list()
    categorical_features = list()
    # enumerate() replaces the original manual "i = 0 / i += 1" counter;
    # i is the positional column index LightGBM expects for categoricals.
    for i, (column, dtype) in enumerate(dtypes.items()):
        if _is_categorical(dtype):
            # PMMLLabelEncoder encodes missing values as -1, which LightGBM
            # treats as its missing-category marker.
            encoder = (PMMLLabelEncoder(missing_values=-1)
                       if missing_value_aware else OrdinalEncoder())
            transformers.append((column, encoder, [column]))
            categorical_features.append(i)
        else:
            # Non-categorical columns are forwarded unchanged.
            transformers.append((column, "passthrough", [column]))
    return (ColumnTransformer(transformers,
                              remainder="drop"), categorical_features)
Exemplo n.º 6
0
	def test_transform_string(self):
		"""Transforming strings yields 1-D label indices; None passes through."""
		encoder = PMMLLabelEncoder()
		encoder.fit(["A", None, "B", "C"])
		cases = [
			(["A", "C", None, "B"], [0, 2, None, 1]),
			(numpy.array([None]), [None]),
			(Series(numpy.array(["A", "B", "C"])), [0, 1, 2]),
		]
		# The encoder must accept plain lists, ndarrays and pandas Series alike.
		for raw, expected in cases:
			self.assertEqual(expected, encoder.transform(raw).tolist())
Exemplo n.º 7
0
	def test_transform_string(self):
		"""Transforming strings yields 2-D label indices; None passes through."""
		encoder = PMMLLabelEncoder()
		encoder.fit(["A", None, "B", "C"])
		cases = [
			(["A", "C", None, "B"], [[0], [2], [None], [1]]),
			(numpy.array([None]), [[None]]),
			(Series(numpy.array(["A", "B", "C"])), [[0], [1], [2]]),
		]
		# The encoder must accept plain lists, ndarrays and pandas Series alike.
		for raw, expected in cases:
			self.assertEqual(expected, encoder.transform(raw).tolist())
Exemplo n.º 8
0
 def test_fit_string(self):
     """Fitting from list, ndarray and Series inputs yields identical classes_."""
     observations = ["A", None, "A", "B", None, "C", "C", "B"]
     expected = ["A", "B", "C"]
     encoder = PMMLLabelEncoder()
     # classes_ only appears after the first fit() call.
     self.assertFalse(hasattr(encoder, "classes_"))
     for raw in (observations,
                 numpy.array(observations),
                 Series(numpy.array(observations))):
         encoder.fit(raw)
         self.assertEqual(expected, encoder.classes_.tolist())
Exemplo n.º 9
0
def build_auto_na_hist(regressor, name):
	"""Fit, verify and persist a PMML pipeline for the auto dataset with missing values."""
	continuous_columns = ["displacement", "horsepower", "weight", "acceleration"]
	categorical_columns = ["cylinders", "model_year", "origin"]
	features = [([column], ContinuousDomain()) for column in continuous_columns]
	features += [([column], [CategoricalDomain(), PMMLLabelEncoder()]) for column in categorical_columns]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(features)),
		("regressor", regressor)
	])
	pipeline.fit(auto_na_X, auto_na_y)
	# Embed verification data (a fixed 5% sample) into the PMML document.
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	store_csv(mpg, name)
Exemplo n.º 10
0
def build_audit_na_hist(classifier, name):
	"""Fit, verify and persist a PMML pipeline for the audit dataset with missing values."""
	continuous_columns = ["Age", "Hours", "Income"]
	categorical_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
	features = [([column], ContinuousDomain()) for column in continuous_columns]
	features += [([column], [CategoricalDomain(), PMMLLabelEncoder()]) for column in categorical_columns]
	# The inner sklearn Pipeline is wrapped in a PMMLPipeline for conversion.
	pipeline = PMMLPipeline([
		("pipeline", Pipeline([
			("mapper", DataFrameMapper(features)),
			("classifier", classifier)
		]))
	])
	pipeline.fit(audit_na_X, audit_na_y)
	# Embed verification data (a fixed 5% sample) into the PMML document.
	pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
	adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Exemplo n.º 11
0
def label_encoder(name):
	"""Pick a missing-value-aware encoder for "NA" datasets, a plain LabelEncoder otherwise."""
	if name.endswith("NA"):
		return PMMLLabelEncoder()
	return LabelEncoder()
Exemplo n.º 12
0
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

# Toggle: when True, fold drive type "r" into "4" so the target has two classes.
binary = False

data = pd.read_csv("test/support/mpg.csv")
if binary:
    data["drv"] = data["drv"].replace("r", "4")

# Column groups fed to the DataFrameMapper below; text_features is empty here.
numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

# Numeric columns share one ContinuousDomain; each categorical column gets a
# CategoricalDomain + PMMLLabelEncoder chain; text columns (none here) would
# be tokenized by Splitter and counted by CountVectorizer.
mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), PMMLLabelEncoder()])
     for f in categorical_features] +
    [(f, [CategoricalDomain(),
          CountVectorizer(tokenizer=Splitter())]) for f in text_features])

pipeline = PMMLPipeline([("mapper", mapper),
                         ("model", LGBMClassifier(n_estimators=1000))])
# categorical_feature=[3] marks the mapped position of "class" as categorical
# for LightGBM — presumably index 3 follows the three numeric columns; verify
# against the mapper's output ordering if columns change.
pipeline.fit(data, data["drv"], model__categorical_feature=[3])

# Export the fitted pipeline to PMML, suffixed by the chosen target arity.
suffix = "binary" if binary else "multiclass"
sklearn2pmml(pipeline, "test/support/python/lightgbm_" + suffix + ".pmml")

print(pipeline.predict(data[:10]))
Exemplo n.º 13
0
from sklearn.feature_extraction.text import CountVectorizer
from lightgbm import LGBMRegressor
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

# Column groups fed to the DataFrameMapper below.
numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

# Numeric columns share one ContinuousDomain; each categorical column gets a
# CategoricalDomain + PMMLLabelEncoder chain; the "model" text column is
# tokenized by Splitter and reduced to its 5 most frequent terms.
mapper = DataFrameMapper(
  [(numeric_features, [ContinuousDomain()])] +
  [([f], [CategoricalDomain(), PMMLLabelEncoder()]) for f in categorical_features] +
  [(f, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter(), max_features=5)]) for f in text_features]
)

pipeline = PMMLPipeline([
  ("mapper", mapper),
  ("model", LGBMRegressor(n_estimators=1000))
])
# categorical_feature=[3, 4] marks the mapped positions of "drv" and "class"
# as categorical for LightGBM — presumably indices 3-4 follow the three
# numeric columns; verify against the mapper's output ordering if columns change.
pipeline.fit(data, data["hwy"], model__categorical_feature=[3, 4])

# Export the fitted regression pipeline to PMML.
sklearn2pmml(pipeline, "test/support/python/lightgbm_regression.pmml")

print(pipeline.predict(data[:10]))