示例#1
0
 def test_transform_int(self):
     """Look up integer pairs; only rows matching a mapping key are translated.

     The transformer is built with ``None`` as its second argument, and every
     row that is not a key of the mapping is expected to produce ``None``.
     Each result is expected as a one-element row.
     """
     lookup = {(1, 1): "one", (2, 2): "two", (3, 3): "three"}
     rows = [
         [1, 0], [1, 1],
         [2, 0], [2, 1], [2, 2],
         [3, 0], [3, 1], [3, 2], [3, 3],
     ]
     expected = [
         [None], ["one"],
         [None], [None], ["two"],
         [None], [None], [None], ["three"],
     ]
     transformer = MultiLookupTransformer(lookup, None)
     frame = DataFrame(rows)
     self.assertEqual(expected, transformer.transform(frame).tolist())
示例#2
0
 def test_transform_int(self):
     """Look up integer pairs and compare against a flat list of results.

     Each case pairs an input row with the value the transformer is expected
     to return for it; rows that are not keys of the mapping expect ``None``
     (the second argument given to the transformer).
     """
     cases = [
         ([1, 0], None),
         ([1, 1], "one"),
         ([2, 0], None),
         ([2, 1], None),
         ([2, 2], "two"),
         ([3, 0], None),
         ([3, 1], None),
         ([3, 2], None),
         ([3, 3], "three"),
     ]
     lookup = {(1, 1): "one", (2, 2): "two", (3, 3): "three"}
     transformer = MultiLookupTransformer(lookup, None)
     frame = DataFrame([row for row, _ in cases])
     result = transformer.transform(frame)
     self.assertEqual([label for _, label in cases], result.tolist())
示例#3
0
	def test_transform_object(self):
		"""Check constructor validation and lookups over object-valued pairs.

		A mapping that contains the one-element key ``("zero",)`` next to
		two-element keys must make the constructor raise ``ValueError``; once
		that key is removed the same mapping is accepted. Lookups are then
		checked for both a DataFrame and an object-dtype ``numpy.matrix``.
		"""
		bad_key = ("zero",)
		translations = {bad_key : "null", ("one", True) : "ein", ("two", True) : "zwei", ("three", True) : "drei"}
		# Presumably rejected because the key length differs from the others.
		with self.assertRaises(ValueError):
			MultiLookupTransformer(translations, None)
		del translations[bad_key]
		transformer = MultiLookupTransformer(translations, None)

		# DataFrame input: rows containing None match no key.
		frame = DataFrame([["one", None], ["one", True], [None, True], ["two", True], ["three", True]])
		frame_result = transformer.transform(frame)
		self.assertEqual([None, "ein", None, "zwei", "drei"], frame_result.tolist())

		# Object-dtype matrix input goes through the same transformer.
		matrix = numpy.matrix([["one", True], ["one", None], ["two", True]], dtype = "O")
		matrix_result = transformer.transform(matrix)
		self.assertEqual(["ein", None, "zwei"], matrix_result.tolist())
示例#4
0
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
	"""Fit, verify and store a PMML pipeline on the auto data, plus its predictions.

	The pipeline is a DataFrameMapper followed by SelectUnique and then
	``regressor``. It is fitted on the module-level ``auto_X`` / ``auto_y``,
	configured with ``pmml_options``, verified on a 5% sample of ``auto_X``
	and handed to ``store_pkl``. Predictions for all of ``auto_X`` are passed
	to ``store_csv`` as a single "mpg" column.

	Args:
		regressor: final estimator of the pipeline.
		name: identifier passed to ``store_pkl`` and ``store_csv``.
		fit_params: optional dict of extra keyword arguments for
			``pipeline.fit``; ``None`` means no extra arguments.
		predict_params: optional dict forwarded to ``pipeline.verify`` (as
			``predict_params``) and unpacked into ``pipeline.predict``;
			``None`` means no extra arguments.
		**pmml_options: forwarded to ``pipeline.configure``.
	"""
	# None replaces the former ``{}`` defaults so that no dict object is
	# shared between calls; each call gets its own empty dict.
	fit_params = {} if fit_params is None else fit_params
	predict_params = {} if predict_params is None else predict_params
	# Labels for the (cylinders, origin) pairs; other pairs fall back to
	# the "other" default given to MultiLookupTransformer below.
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		# Flag computed from the "cylinders % 2.0 > 0.0" expression.
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		# Pair lookup through the mapping above, then one-hot encoding.
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		# model_year -> "19YY-01-01" string -> date -> DaysSinceYearTransformer(1977) -> binarized at 0.
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		# Concatenated model_year/origin, one-hot encoded, then filtered by a small random forest.
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBRegressor pipelines are verified with explicit tolerances; all
	# other regressors use the defaults of ``verify``.
	verify_options = {}
	if isinstance(regressor, XGBRegressor):
		verify_options = {"precision" : 1e-5, "zeroThreshold" : 1e-5}
	pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, **verify_options)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
示例#5
0
def build_auto(regressor, name, **pmml_options):
    """Fit, verify and store a PMML pipeline on the auto data, plus its predictions.

    A DataFrameMapper, SelectUnique and ``regressor`` are chained into a
    PMMLPipeline. It is fitted on the module-level ``auto_X`` / ``auto_y``,
    configured with ``pmml_options``, verified on a 5% sample of ``auto_X``
    and handed to ``store_pkl``. Predictions for all of ``auto_X`` are passed
    to ``store_csv`` as a single "mpg" column.

    Args:
        regressor: final estimator of the pipeline.
        name: identifier passed to ``store_pkl`` and ``store_csv``.
        **pmml_options: forwarded to ``pipeline.configure``.
    """
    # Labels for the (cylinders, origin) pairs; other pairs become "other".
    pair_labels = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (4, 3): "4/3",
    }

    # The feature specs are built in the order they appear in the mapper.
    cylinders_origin = (
        ["cylinders", "origin"],
        [
            MultiDomain([CategoricalDomain(), CategoricalDomain()]),
            MultiLookupTransformer(pair_labels, default_value="other"),
            LabelBinarizer(),
        ],
    )
    odd_cylinders = (
        ["cylinders"],
        Alias(
            ExpressionTransformer("X[0] % 2.0 > 0.0"),
            name="odd(cylinders)",
            prefit=True,
        ),
    )
    # Pre/post 1973 oil crisis effects
    model_year_bin = (
        ["model_year"],
        [CategoricalDomain(), Binarizer(threshold=77)],
        {"alias": "bin(model_year, 77)"},
    )
    forest = RandomForestRegressor(random_state=13, n_estimators=3)
    model_year_origin = (
        ["model_year", "origin"],
        [
            MultiDomain([CategoricalDomain(), CategoricalDomain()]),
            ConcatTransformer("/"),
            LabelBinarizer(),
            SelectorProxy(SelectFromModel(forest, threshold="1.25 * mean")),
        ],
    )
    scaled_continuous = (
        ["displacement", "horsepower", "weight", "acceleration"],
        [ContinuousDomain(), StandardScaler()],
    )
    weight_ratio = (
        ["weight", "displacement"],
        ExpressionTransformer("(X[0] / X[1]) + 0.5"),
        {"alias": "weight / displacement + 0.5"},
    )
    mapper = DataFrameMapper([
        cylinders_origin,
        odd_cylinders,
        model_year_bin,
        model_year_origin,
        scaled_continuous,
        weight_ratio,
    ])

    steps = [
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor),
    ]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(**pmml_options)

    # Only XGBRegressor pipelines get explicit verification tolerances.
    verify_options = {}
    if isinstance(regressor, XGBRegressor):
        verify_options.update(precision=1e-5, zeroThreshold=1e-5)
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                    **verify_options)

    store_pkl(pipeline, name)
    predictions = pipeline.predict(auto_X)
    store_csv(DataFrame(predictions, columns=["mpg"]), name)
示例#6
0
def build_auto(regressor, name, **pmml_options):
    """Fit a plain Pipeline on the auto data, convert it to PMML form and store it.

    A DataFrameMapper and ``regressor`` are chained into a ``Pipeline`` and
    fitted on the module-level ``auto_X`` / ``auto_y``. The fitted pipeline
    is wrapped with ``make_pmml_pipeline``, configured with ``pmml_options``
    and verified on a 5% sample of ``auto_X``. It is then passed to
    ``store_pkl`` under ``name + ".pkl"``, and its predictions for ``auto_X``
    go to ``store_csv`` under ``name + ".csv"`` as a single "mpg" column.

    Args:
        regressor: final estimator of the pipeline.
        name: base identifier; the ".pkl" / ".csv" suffixes are appended here.
        **pmml_options: forwarded to ``configure`` on the PMML pipeline.
    """
    # Labels for the (cylinders, origin) pairs; other pairs become "other".
    pair_labels = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (6, 3): "6/3",
        (4, 3): "4/3",
    }
    feature_specs = [
        (
            ["cylinders", "origin"],
            [
                MultiDomain([CategoricalDomain(), CategoricalDomain()]),
                MultiLookupTransformer(pair_labels, default_value="other"),
                LabelBinarizer(),
            ],
        ),
        # Pre/post 1973 oil crisis effects
        (
            ["model_year"],
            [CategoricalDomain(), Binarizer(threshold=77)],
            {"alias": "bin(model_year, 77)"},
        ),
        (
            ["displacement", "horsepower", "weight", "acceleration"],
            [ContinuousDomain(), StandardScaler()],
        ),
        (
            ["weight", "displacement"],
            ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"),
            {"alias": "weight / displacement + 0.5"},
        ),
    ]
    mapper = DataFrameMapper(feature_specs)

    fitted = Pipeline([("mapper", mapper), ("regressor", regressor)])
    fitted.fit(auto_X, auto_y)

    # Wrap the already-fitted pipeline, passing the feature and target names.
    pmml_pipeline = make_pmml_pipeline(fitted, auto_X.columns.values,
                                       auto_y.name)
    pmml_pipeline.configure(**pmml_options)

    # Only XGBRegressor pipelines get explicit verification tolerances.
    if isinstance(regressor, XGBRegressor):
        tolerances = {"precision": 1e-5, "zeroThreshold": 1e-5}
    else:
        tolerances = {}
    verification_rows = auto_X.sample(frac=0.05, random_state=13)
    pmml_pipeline.verify(verification_rows, **tolerances)

    store_pkl(pmml_pipeline, name + ".pkl")
    predictions = pmml_pipeline.predict(auto_X)
    store_csv(DataFrame(predictions, columns=["mpg"]), name + ".csv")