def test_transform_int(self):
    """Look up two-column integer rows against a tuple-keyed mapping.

    Rows whose (col0, col1) pair is a key of the mapping are expected to
    produce the mapped label; every other row is expected to produce the
    default value passed to the transformer (``None``).  The result is
    compared as a list of single-element rows.
    """
    lookup = {(1, 1): "one", (2, 2): "two", (3, 3): "three"}
    rows = [
        [1, 0], [1, 1],
        [2, 0], [2, 1], [2, 2],
        [3, 0], [3, 1], [3, 2], [3, 3],
    ]
    expected = [
        [None], ["one"],
        [None], [None], ["two"],
        [None], [None], [None], ["three"],
    ]
    transformer = MultiLookupTransformer(lookup, None)
    actual = transformer.transform(DataFrame(rows)).tolist()
    self.assertEqual(expected, actual)
def test_transform_int(self):
    """Look up two-column integer rows against a tuple-keyed mapping.

    Only rows whose pair of values is a key of the mapping are expected to
    resolve to a label; all others are expected to resolve to the default
    value given to the transformer (``None``).  The result is compared as a
    flat list with one entry per input row.
    """
    lookup = {(1, 1): "one", (2, 2): "two", (3, 3): "three"}
    rows = [
        [1, 0], [1, 1],
        [2, 0], [2, 1], [2, 2],
        [3, 0], [3, 1], [3, 2], [3, 3],
    ]
    expected = [
        None, "one",
        None, None, "two",
        None, None, None, "three",
    ]
    transformer = MultiLookupTransformer(lookup, None)
    transformed = transformer.transform(DataFrame(rows))
    self.assertEqual(expected, transformed.tolist())
def test_transform_object(self):
    """Exercise MultiLookupTransformer with string/boolean tuple keys.

    First checks that constructing the transformer fails with ValueError
    while the mapping still contains a one-element key, then checks lookups
    on both a DataFrame and an object-dtype numpy.matrix.
    """
    short_key = ("zero",)
    mapping = {
        short_key: "null",
        ("one", True): "ein",
        ("two", True): "zwei",
        ("three", True): "drei",
    }

    # The one-element key sits next to two-element keys; presumably the
    # mismatch in key length is what triggers the ValueError.
    with self.assertRaises(ValueError):
        MultiLookupTransformer(mapping, None)

    # Without the odd key the same mapping must be accepted.
    del mapping[short_key]
    transformer = MultiLookupTransformer(mapping, None)

    frame = DataFrame([
        ["one", None],
        ["one", True],
        [None, True],
        ["two", True],
        ["three", True],
    ])
    frame_result = transformer.transform(frame)
    self.assertEqual([None, "ein", None, "zwei", "drei"], frame_result.tolist())

    matrix = numpy.matrix(
        [["one", True], ["one", None], ["two", True]],
        dtype="O",
    )
    matrix_result = transformer.transform(matrix)
    self.assertEqual(["ein", None, "zwei"], matrix_result.tolist())
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
    """Fit, verify and store a PMML pipeline around ``regressor``.

    The pipeline is fitted on the module-level ``auto_X`` / ``auto_y`` data,
    configured with ``pmml_options``, verified on a 5% sample of ``auto_X``
    and handed to ``store_pkl``; its predictions for the whole of ``auto_X``
    are handed to ``store_csv`` as a single ``mpg`` column.

    Parameters
    ----------
    regressor
        Final estimator of the pipeline.
    name
        Identifier passed to ``store_pkl`` and ``store_csv``.
    fit_params : dict, optional
        Extra keyword arguments for ``pipeline.fit``.  ``None`` (the default)
        means no extra arguments.
    predict_params : dict, optional
        Extra keyword arguments for ``pipeline.predict``; also forwarded to
        ``pipeline.verify``.  ``None`` (the default) means no extra arguments.
    **pmml_options
        Forwarded to ``pipeline.configure``.
    """
    # Use None sentinels instead of ``{}`` defaults so that no dict object is
    # shared between separate calls of this function.
    fit_params = {} if fit_params is None else fit_params
    predict_params = {} if predict_params is None else predict_params

    cylinders_origin_mapping = {
        (8, 1) : "8/1",
        (6, 1) : "6/1",
        (4, 1) : "4/1",
        (6, 2) : "6/2",
        (4, 2) : "4/2",
        (4, 3) : "4/3"
    }
    mapper = DataFrameMapper([
        (["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
        (["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
        # Rebuild a "19YY-01-01" date string from the two-digit year before
        # measuring the distance to 1977 and binarizing it at zero.
        (["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
        (["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
        (["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
        (["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y, **fit_params)
    pipeline.configure(**pmml_options)

    # The same seeded sample is used whichever verification tolerance applies.
    verification_sample = auto_X.sample(frac = 0.05, random_state = 13)
    if isinstance(regressor, XGBRegressor):
        # XGBoost regressors are verified with a looser tolerance than the
        # verify() defaults.
        pipeline.verify(verification_sample, predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
    else:
        pipeline.verify(verification_sample, predict_params = predict_params)

    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
    store_csv(mpg, name)
def build_auto(regressor, name, **pmml_options):
    """Fit, verify and store a PMML pipeline around ``regressor``.

    The pipeline is fitted on the module-level ``auto_X`` / ``auto_y`` data,
    configured with ``pmml_options``, verified on a 5% sample of ``auto_X``
    and handed to ``store_pkl``; its predictions for the whole of ``auto_X``
    are handed to ``store_csv`` as a single ``mpg`` column.
    """
    lookup = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (4, 3): "4/3",
    }

    # Each feature definition is built in the order it appears in the mapper.
    cylinders_origin = (
        ["cylinders", "origin"],
        [
            MultiDomain([CategoricalDomain(), CategoricalDomain()]),
            MultiLookupTransformer(lookup, default_value="other"),
            LabelBinarizer(),
        ],
    )
    odd_cylinders = (
        ["cylinders"],
        Alias(
            ExpressionTransformer("X[0] % 2.0 > 0.0"),
            name="odd(cylinders)",
            prefit=True,
        ),
    )
    # Pre/post 1973 oil crisis effects
    model_year_bin = (
        ["model_year"],
        [CategoricalDomain(), Binarizer(threshold=77)],
        {"alias": "bin(model_year, 77)"},
    )
    year_origin_selector = SelectorProxy(
        SelectFromModel(
            RandomForestRegressor(random_state=13, n_estimators=3),
            threshold="1.25 * mean",
        )
    )
    model_year_origin = (
        ["model_year", "origin"],
        [
            MultiDomain([CategoricalDomain(), CategoricalDomain()]),
            ConcatTransformer("/"),
            LabelBinarizer(),
            year_origin_selector,
        ],
    )
    scaled_continuous = (
        ["displacement", "horsepower", "weight", "acceleration"],
        [ContinuousDomain(), StandardScaler()],
    )
    weight_ratio = (
        ["weight", "displacement"],
        ExpressionTransformer("(X[0] / X[1]) + 0.5"),
        {"alias": "weight / displacement + 0.5"},
    )

    frame_mapper = DataFrameMapper([
        cylinders_origin,
        odd_cylinders,
        model_year_bin,
        model_year_origin,
        scaled_continuous,
        weight_ratio,
    ])
    steps = [
        ("mapper", frame_mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor),
    ]
    pmml_pipeline = PMMLPipeline(steps)
    pmml_pipeline.fit(auto_X, auto_y)
    pmml_pipeline.configure(**pmml_options)

    # XGBoost regressors are verified with explicit, looser tolerances;
    # everything else relies on the defaults of verify().
    tolerances = {}
    if isinstance(regressor, XGBRegressor):
        tolerances = {"precision": 1e-5, "zeroThreshold": 1e-5}
    pmml_pipeline.verify(auto_X.sample(frac=0.05, random_state=13), **tolerances)

    store_pkl(pmml_pipeline, name)
    predictions = pmml_pipeline.predict(auto_X)
    store_csv(DataFrame(predictions, columns=["mpg"]), name)
def build_auto(regressor, name, **pmml_options):
    """Fit a plain pipeline, convert it to a PMML pipeline, verify and store it.

    A ``Pipeline`` is fitted on the module-level ``auto_X`` / ``auto_y`` data
    and then wrapped via ``make_pmml_pipeline`` using the column names of
    ``auto_X`` and the name of ``auto_y``.  The result is configured with
    ``pmml_options``, verified on a 5% sample of ``auto_X`` and handed to
    ``store_pkl`` under ``name + ".pkl"``; its predictions for the whole of
    ``auto_X`` are handed to ``store_csv`` under ``name + ".csv"`` as a single
    ``mpg`` column.
    """
    lookup = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (6, 3): "6/3",
        (4, 3): "4/3",
    }

    # Each feature definition is built in the order it appears in the mapper.
    cylinders_origin = (
        ["cylinders", "origin"],
        [
            MultiDomain([CategoricalDomain(), CategoricalDomain()]),
            MultiLookupTransformer(lookup, default_value="other"),
            LabelBinarizer(),
        ],
    )
    # Pre/post 1973 oil crisis effects
    model_year_bin = (
        ["model_year"],
        [CategoricalDomain(), Binarizer(threshold=77)],
        {"alias": "bin(model_year, 77)"},
    )
    scaled_continuous = (
        ["displacement", "horsepower", "weight", "acceleration"],
        [ContinuousDomain(), StandardScaler()],
    )
    weight_ratio = (
        ["weight", "displacement"],
        ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"),
        {"alias": "weight / displacement + 0.5"},
    )
    frame_mapper = DataFrameMapper([
        cylinders_origin,
        model_year_bin,
        scaled_continuous,
        weight_ratio,
    ])

    # Fit as an ordinary pipeline first, then wrap the fitted object.
    fitted = Pipeline([("mapper", frame_mapper), ("regressor", regressor)])
    fitted.fit(auto_X, auto_y)
    pmml_pipeline = make_pmml_pipeline(fitted, auto_X.columns.values, auto_y.name)
    pmml_pipeline.configure(**pmml_options)

    # XGBoost regressors are verified with explicit, looser tolerances;
    # everything else relies on the defaults of verify().
    tolerances = {}
    if isinstance(regressor, XGBRegressor):
        tolerances = {"precision": 1e-5, "zeroThreshold": 1e-5}
    pmml_pipeline.verify(auto_X.sample(frac=0.05, random_state=13), **tolerances)

    store_pkl(pmml_pipeline, name + ".pkl")
    predictions = pmml_pipeline.predict(auto_X)
    store_csv(DataFrame(predictions, columns=["mpg"]), name + ".csv")