Exemplo n.º 1
0
def build_auto(regressor, name, fit_params = {}, predict_params = {}, **pmml_options):
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
Exemplo n.º 2
0
def build_auto(regressor, name, **pmml_options):
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		(["cylinders"], Alias(ExpressionTransformer("X[0] % 2.0 > 0.0"), name = "odd(cylinders)", prefit = True)),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["model_year", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), ConcatTransformer("/"), LabelBinarizer(), SelectorProxy(SelectFromModel(RandomForestRegressor(random_state = 13, n_estimators = 3), threshold = "1.25 * mean"))]),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)
Exemplo n.º 3
0
	def test_fit_transform(self):
		domain = MultiDomain([ContinuousDomain(missing_value_replacement = 0.0), CategoricalDomain(missing_value_replacement = "zero")])
		X = DataFrame([[-1.0, "minus one"], [float("NaN"), None], [1.0, "one"]], columns = ["x1", "x2"])
		Xt = domain.fit_transform(X)
		self.assertTrue(isinstance(Xt, DataFrame))
		self.assertEqual([-1.0, 0.0, 1.0], Xt["x1"].tolist())
		self.assertEqual(["minus one", "zero", "one"], Xt["x2"].tolist())
		X = numpy.array([[float("NaN"), None]])
		Xt = domain.transform(X)
		self.assertTrue(isinstance(Xt, numpy.ndarray))
		self.assertTrue([0.0], Xt[:, 0].tolist())
		self.assertTrue(["zero"], Xt[:, 1].tolist())
Exemplo n.º 4
0
def build_audit(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, **pmml_options):
	continuous_mapper = DataFrameMapper([
		(["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
	])
	categorical_mapper = DataFrameMapper([
		(["Employment"], [CategoricalDomain(), SubstringTransformer(0, 3), OneHotEncoder(drop = ["Vol"]), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
		(["Education"], [CategoricalDomain(), ReplaceTransformer("[aeiou]", ""), OneHotEncoder(drop = "first"), SelectFromModel(RandomForestClassifier(n_estimators = 3, random_state = 13), threshold = "1.25 * mean")]),
		(["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		(["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
		(["Gender"], [CategoricalDomain(), MatchesTransformer("^Male$"), CastTransformer(int)]),
		(["Deductions"], [CategoricalDomain()]),
	])
	pipeline = Pipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Exemplo n.º 5
0
def build_audit_cat(classifier, name, with_proba = True, fit_params = {}):
	marital_mapping = {
		"Married-spouse-absent" : "Married"
	}
	mapper = DataFrameMapper(
		[([column], ContinuousDomain(display_name = column)) for column in ["Age", "Income"]] +
		[(["Hours"], [ContinuousDomain(display_name = "Hours"), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] +
		[(["Employment", "Education"], [MultiDomain([CategoricalDomain(display_name = "Employment"), CategoricalDomain(display_name = "Education")]), OrdinalEncoder(dtype = numpy.int_)])] +
		[(["Marital"], [CategoricalDomain(display_name = "Marital"), FilterLookupTransformer(marital_mapping), OrdinalEncoder(dtype = numpy.uint16)])] +
		[(["Occupation"], [CategoricalDomain(display_name = "Occupation"), OrdinalEncoder(dtype = numpy.float_)])] +
		[([column], [CategoricalDomain(display_name = column), LabelEncoder()]) for column in ["Gender", "Deductions"]]
	)
	pipeline = Pipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Exemplo n.º 6
0
def build_audit_cat(classifier, name, with_proba=True, **fit_params):
    mapper = DataFrameMapper(
        [([column], ContinuousDomain())
         for column in ["Age", "Income"]] + [(["Hours"], [
             ContinuousDomain(),
             CutTransformer(bins=[0, 20, 40, 60, 80, 100],
                            labels=False,
                            right=False,
                            include_lowest=True)
         ])] + [(["Employment", "Education", "Marital", "Occupation"], [
             MultiDomain([
                 CategoricalDomain(),
                 CategoricalDomain(),
                 CategoricalDomain(),
                 CategoricalDomain()
             ]),
             OrdinalEncoder()
         ])] +
        [([column], [CategoricalDomain(), LabelEncoder()])
         for column in ["Gender", "Deductions"]])
    pipeline = Pipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name)
Exemplo n.º 7
0
def build_auto(regressor, name, **pmml_options):
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(6, 3) : "6/3",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	])
	pipeline = Pipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name + ".csv")
Exemplo n.º 8
0
def build_audit(classifier, name, with_proba=True, **kwargs):
    continuous_mapper = DataFrameMapper([
        (["Age", "Income",
          "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
    ])
    categorical_mapper = DataFrameMapper([
        (["Employment"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(DecisionTreeClassifier(random_state=13))
        ]),
        (["Education"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(RandomForestClassifier(random_state=13,
                                                   n_estimators=3),
                            threshold="1.25 * mean")
        ]),
        (["Marital"], [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        (["Occupation"],
         [CategoricalDomain(),
          LabelBinarizer(),
          SelectKBest(k=3)]),
        (["Gender"],
         [CategoricalDomain(),
          LabelBinarizer(neg_label=-3, pos_label=3)]),
        (["Deductions"], [CategoricalDomain(),
                          LabelEncoder()]),
    ])
    pipeline = Pipeline([
        ("union",
         FeatureUnion([("continuous", continuous_mapper),
                       ("categorical",
                        Pipeline([("mapper", categorical_mapper),
                                  ("polynomial", PolynomialFeatures())]))])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13))
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")