Пример #1
0
def build_iris(classifier, name, **pmml_options):
    """Fit, verify and store an iris species classification pipeline.

    Each of the four continuous measurements is decorated with a
    ContinuousDomain before being fed to the supplied classifier.
    """
    columns = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]
    mapper = DataFrameMapper([([column], ContinuousDomain()) for column in columns])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    verification_sample = iris_X.sample(n=3, random_state=13)
    if isinstance(classifier, XGBClassifier):
        # XGBoost arithmetic is float32-based, so allow a small tolerance
        pipeline.verify(verification_sample, precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(verification_sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    proba_columns = [
        "probability(setosa)", "probability(versicolor)", "probability(virginica)"
    ]
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=proba_columns)
    store_csv(pandas.concat((species, species_proba), axis=1), name)
Пример #2
0
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit, verify and store an auto-mpg regression pipeline on data with missing values."""
	mappings = []
	# Sparse-coded categoricals: -1 marks a missing value
	for column in ["cylinders", "model_year"]:
		mappings.append(([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]))
	mappings.append((["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()]))
	# Continuous columns are binned and/or imputed before encoding
	mappings.append((["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()]))
	mappings.append((["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()]))
	for column, low, high in [("horsepower", 50, 225), ("weight", 2000, 5000)]:
		mappings.append(([column], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = low, high_value = high), SimpleImputer(strategy = "median")]))
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(mappings)),
		("regressor", regressor)
	], predict_transformer = predict_transformer, apply_transformer = apply_transformer)
	pipeline.fit(auto_na_X, auto_na_y)
	if isinstance(regressor, DecisionTreeRegressor):
		# Export per-node impurities (non-zero only) as PMML node extensions
		tree = regressor.tree_
		node_impurity = dict()
		for node_idx in range(tree.node_count):
			impurity = tree.impurity[node_idx]
			if impurity != 0.0:
				node_impurity[node_idx] = impurity
		pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
	pipeline.configure(**pmml_options)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	if isinstance(regressor, DecisionTreeRegressor):
		Xt = pipeline_transform(pipeline, auto_na_X)
		mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
		mpg = pandas.concat((mpg, mpg_apply), axis = 1)
	store_csv(mpg, name)
Пример #3
0
def build_auto(regressor, name, **pmml_options):
	"""Fit, verify and store an auto-mpg regression pipeline with engineered features."""
	# Known (cylinders, origin) combinations; anything else maps to "other"
	cylinders_origin_mapping = {
		(8, 1) : "8/1", (6, 1) : "6/1", (4, 1) : "4/1",
		(6, 2) : "6/2", (4, 2) : "4/2",
		(6, 3) : "6/3", (4, 3) : "4/3"
	}
	mappings = [
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		# Pre/post 1973 oil crisis effects
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(mappings)),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	verification_sample = auto_X.sample(frac = 0.05, random_state = 13)
	if isinstance(regressor, XGBRegressor):
		# XGBoost arithmetic is float32-based, so allow a small tolerance
		pipeline.verify(verification_sample, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(verification_sample)
	store_pkl(pipeline, name)
	store_csv(DataFrame(pipeline.predict(auto_X), columns = ["mpg"]), name)
Пример #4
0
def build_auto(regressor, name, **pmml_options):
	"""Fit, verify and store an auto-mpg model; supports regressors and IsolationForest."""
	cat_columns = ["cylinders", "model_year", "origin"]
	cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
	lightgbm = isinstance(regressor, LGBMRegressor)
	# LightGBM consumes label-encoded categoricals directly; others need binarization
	cat_transformer = label_encoder if lightgbm else label_binarizer
	cat_mappings = [([column], [cat_domain(name), cat_transformer(name)]) for column in cat_columns]
	cont_mappings = [([column], [cont_domain(name)]) for column in cont_columns]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
		("regressor", regressor)
	])
	if lightgbm:
		pipeline.fit(auto_X, auto_y, regressor__categorical_feature = [0, 1, 2])
	elif isinstance(regressor, IsolationForest):
		# Unsupervised: no target column
		pipeline.fit(auto_X)
	else:
		pipeline.fit(auto_X, auto_y)
	verification_sample = auto_X.sample(n = 3, random_state = 13)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(verification_sample, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(verification_sample)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	if isinstance(regressor, IsolationForest):
		decision_function = DataFrame(pipeline.decision_function(auto_X), columns = ["decisionFunction"])
		outlier = DataFrame(pipeline.predict(auto_X), columns = ["outlier"])
		# Map the -1/1 prediction to "true"/"false" outlier flags
		outlier['outlier'] = outlier['outlier'].apply(lambda value: str(bool(value == -1)).lower())
		store_csv(pandas.concat((decision_function, outlier), axis = 1), name)
	else:
		store_csv(DataFrame(pipeline.predict(auto_X), columns = ["mpg"]), name)
Пример #5
0
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
	"""Fit, verify and store an auto-mpg regression pipeline with rich feature engineering.

	Parameters:
		regressor: the final estimator step.
		name: basename for the stored artifacts.
		fit_params: optional keyword arguments forwarded to `pipeline.fit`.
		predict_params: optional keyword arguments forwarded to `pipeline.predict`/`verify`.
		pmml_options: PMML conversion options forwarded to `pipeline.configure`.
	"""
	# Fix: mutable default arguments ({} shared across calls) replaced with None sentinels
	fit_params = {} if fit_params is None else fit_params
	predict_params = {} if predict_params is None else predict_params
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		# Synthesize a date from the two-digit model year, then binarize around 1977
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		# XGBoost arithmetic is float32-based, so allow a small tolerance
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
Пример #6
0
def build_audit(classifier, name, **pmml_options):
	"""Fit, verify and store an audit classification pipeline.

	LightGBM classifiers get label-encoded categoricals (including "Age");
	all other classifiers get binarized categoricals.
	"""
	# Fix: the original repeated `isinstance(classifier, LGBMClassifier)` in two
	# consecutive if-blocks; compute it once and branch once.
	lightgbm = isinstance(classifier, LGBMClassifier)
	if lightgbm:
		cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Income", "Hours"]
		cat_mappings = [([cat_column], [cat_domain(name), label_encoder(name)]) for cat_column in cat_columns]
	else:
		cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Age", "Income", "Hours"]
		cat_mappings = [([cat_column], [cat_domain(name), label_binarizer(name)]) for cat_column in cat_columns]
	cont_mappings = [([cont_column], cont_domain(name)) for cont_column in cont_columns]
	mapper = DataFrameMapper(cat_mappings + cont_mappings)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	if lightgbm:
		pipeline.fit(audit_X, audit_y, classifier__categorical_feature = [0, 1, 2, 3, 4, 5])
	else:
		# Fix: the XGBClassifier elif-branch duplicated the plain fit call;
		# only the Age-cast precondition is classifier-specific.
		if isinstance(classifier, XGBClassifier) and name == "XGBoostAuditNA":
			# NOTE(review): mutates the shared audit_X frame in place — confirm intended
			audit_X["Age"] = audit_X["Age"].astype(float)
		pipeline.fit(audit_X, audit_y)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(n = 3, random_state = 13))
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
Пример #7
0
def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    """Fit and store a TF-IDF based sentiment classification pipeline.

    Parameters:
        classifier: the final estimator step.
        name: basename for the stored ``.pkl`` and ``.csv`` artifacts.
        with_proba: also store class probabilities alongside predictions.
        pmml_options: PMML conversion options forwarded to ``pipeline.configure``.
    """
    pipeline = PMMLPipeline([
        ("tf-idf",
         TfidfVectorizer(
             analyzer="word",
             preprocessor=None,
             strip_accents=None,
             lowercase=True,
             token_pattern=None,
             tokenizer=Splitter(),
             stop_words="english",
             ngram_range=(1, 2),
             norm=None,
             # RandomForest works on float32; everything else gets float64
             dtype=(numpy.float32 if isinstance(
                 classifier, RandomForestClassifier) else numpy.float64))),
        ("selector", SelectKBest(f_classif, k=500)), ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    # Fix: "== True" comparison replaced with direct truthiness test
    if with_proba:
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X),
                                columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
Пример #8
0
def build_audit(mapper, classifier, name, **pmml_options):
    """Fit the supplied mapper + classifier on the audit data, then store model and scores."""
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    predictions = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    probabilities = DataFrame(pipeline.predict_proba(audit_X),
                              columns=["probability(0)", "probability(1)"])
    store_csv(pandas.concat((predictions, probabilities), axis=1), name)
Пример #9
0
 def test_configure(self):
     """PMML options must reach the final estimator only when explicitly configured."""
     regressor = DecisionTreeRegressor()
     pipeline = PMMLPipeline([("regressor", regressor)])
     # A fresh estimator carries no conversion options
     self.assertFalse(hasattr(regressor, "pmml_options_"))
     # Configuring with no options is a no-op
     pipeline.configure()
     self.assertFalse(hasattr(regressor, "pmml_options_"))
     # Options are propagated onto the final estimator
     pipeline.configure(compact=True, flat=True)
     self.assertTrue(hasattr(regressor, "pmml_options_"))
     options = regressor.pmml_options_
     self.assertEqual(True, options["compact"])
     self.assertEqual(True, options["flat"])
Пример #10
0
def xgboost_auto():
    """Fit an XGBoost regressor on the auto dataset and export it to PMML."""
    mapper = make_xgboost_dataframe_mapper(auto_X.dtypes,
                                           missing_value_aware=False)
    regressor = XGBRegressor(n_estimators=31, max_depth=3, random_state=13)
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(compact=False)
    sklearn2pmml(pipeline, "pmml/XGBoostAuto.pmml", with_repr=True)
Пример #11
0
def xgboost_audit():
    """Fit an XGBoost classifier on the audit dataset and export it to PMML."""
    mapper = make_xgboost_dataframe_mapper(audit_X.dtypes,
                                           missing_value_aware=False)
    classifier = XGBClassifier(n_estimators=71, max_depth=5, random_state=13)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(compact=True)
    sklearn2pmml(pipeline, "pmml/XGBoostAudit.pmml", with_repr=True)
Пример #12
0
def lightgbm_audit():
    """Fit a LightGBM classifier on the audit dataset and export it to PMML."""
    # The mapper factory also reports which feature indices are categorical
    mapper, categorical_feature = make_lightgbm_dataframe_mapper(
        audit_X.dtypes, missing_value_aware=False)
    classifier = LGBMClassifier(n_estimators=71, max_depth=7, random_state=13)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X,
                 audit_y,
                 classifier__categorical_feature=categorical_feature)
    pipeline.configure(compact=True)
    sklearn2pmml(pipeline, "pmml/LightGBMAudit.pmml", with_repr=False)
Пример #13
0
def lightgbm_auto():
    """Fit a LightGBM regressor on the auto dataset and export it to PMML."""
    # The mapper factory also reports which feature indices are categorical
    mapper, categorical_feature = make_lightgbm_dataframe_mapper(
        auto_X.dtypes, missing_value_aware=False)
    regressor = LGBMRegressor(n_estimators=31, max_depth=5, random_state=13)
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X,
                 auto_y,
                 regressor__categorical_feature=categorical_feature)
    pipeline.configure(compact=True)
    sklearn2pmml(pipeline, "pmml/LightGBMAuto.pmml", with_repr=False)
Пример #14
0
def sklearn_audit(classifier, name):
    """Fit a scikit-learn classifier on the audit dataset and export it to PMML."""
    cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
    cont_columns = ["Age", "Income", "Hours"]
    cat_mappings = [([column], [CategoricalDomain(), OneHotEncoder()])
                    for column in cat_columns]
    cont_mappings = [([column], ContinuousDomain()) for column in cont_columns]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(compact=False)
    sklearn2pmml(pipeline, "pmml/" + name + ".pmml", with_repr=False)
Пример #15
0
def sklearn_auto(regressor, name):
    """Fit a scikit-learn regressor on the auto dataset and export it to PMML."""
    cat_columns = ["cylinders", "model_year", "origin"]
    cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
    cat_mappings = [([column], [CategoricalDomain(), OneHotEncoder()])
                    for column in cat_columns]
    cont_mappings = [([column], ContinuousDomain()) for column in cont_columns]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    pipeline.configure(compact=False)
    sklearn2pmml(pipeline, "pmml/" + name + ".pmml", with_repr=False)
Пример #16
0
def build_sentiment(classifier, transformer, name, with_proba = True, **pmml_options):
	"""Fit a text-classification pipeline on the sentiment data and store model and scores."""
	steps = [
		("transformer", transformer),
		("densifier", DenseTransformer()),
		("selector", SelectKBest(f_classif, k = 500)),
		("classifier", classifier)
	]
	pipeline = PMMLPipeline(steps)
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	predictions = pipeline.predict(sentiment_X)
	score = DataFrame(predictions, columns = ["Score"])
	if with_proba:
		probabilities = pipeline.predict_proba(sentiment_X)
		score_proba = DataFrame(probabilities, columns = ["probability(0)", "probability(1)"])
		score = pandas.concat((score, score_proba), axis = 1)
	store_csv(score, name)
Пример #17
0
def build_auto_na(regressor, name):
    """Fit and store an auto-mpg regression pipeline on data with missing values."""
    mappings = []
    # Sparse-coded categoricals: -1 marks a missing value
    for column in ["cylinders", "model_year"]:
        mappings.append(([column], [
            CategoricalDomain(missing_values=-1),
            CategoricalImputer(missing_values=-1),
            PMMLLabelBinarizer()
        ]))
    mappings.append((["origin"], [
        CategoricalImputer(missing_values=-1),
        OneHotEncoder()
    ]))
    # Continuous columns are binned and/or imputed before encoding
    mappings.append((["acceleration"], [
        ContinuousDomain(missing_values=None),
        CutTransformer(bins=[5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25],
                       labels=False),
        CategoricalImputer(),
        LabelBinarizer()
    ]))
    mappings.append((["displacement"], [
        ContinuousDomain(missing_values=None),
        Imputer(),
        CutTransformer(bins=[0, 100, 200, 300, 400, 500],
                       labels=["XS", "S", "M", "L", "XL"]),
        LabelBinarizer()
    ]))
    # Extreme values are clipped to the given bounds before imputation
    for column, low, high in [("horsepower", 50, 225), ("weight", 2000, 5000)]:
        mappings.append(([column], [
            ContinuousDomain(missing_values=None,
                             outlier_treatment="as_extreme_values",
                             low_value=low,
                             high_value=high),
            Imputer()
        ]))
    pipeline = PMMLPipeline([("mapper", DataFrameMapper(mappings)),
                             ("regressor", regressor)])
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        # Export per-node impurities (non-zero only) as PMML node extensions
        tree = regressor.tree_
        node_impurity = dict()
        for node_idx in range(tree.node_count):
            impurity = tree.impurity[node_idx]
            if impurity != 0.0:
                node_impurity[node_idx] = impurity
        pipeline.configure(
            node_extensions={regressor.criterion: node_impurity})
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
Пример #18
0
def build_versicolor_direct(classifier, name, with_proba=True, **pmml_options):
    """Fit, verify and store a versicolor classifier on the two petal features only.

    Parameters:
        classifier: the final estimator step.
        name: basename for the stored artifacts.
        with_proba: also store class probabilities alongside predictions.
        pmml_options: PMML conversion options forwarded to ``pipeline.configure``.
    """
    # Keep only the petal measurements; drop all other columns
    transformer = ColumnTransformer(
        [("all", "passthrough", ["Petal.Length", "Petal.Width"])],
        remainder="drop")
    pipeline = PMMLPipeline([("transformer", transformer),
                             ("classifier", classifier)])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline.configure(**pmml_options)
    pipeline.verify(versicolor_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    # Fix: "== True" comparison replaced with direct truthiness test
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(versicolor_X),
                                  columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name)
Пример #19
0
def build_sentiment(classifier, tokenizer, name, with_proba = True, **pmml_options):
	"""Fit and store a sentiment pipeline combining TF-IDF and word-count features.

	Parameters:
		classifier: the final estimator step.
		tokenizer: tokenizer callable handed to TfidfVectorizer.
		name: basename for the stored artifacts.
		with_proba: also store class probabilities alongside predictions.
		pmml_options: PMML conversion options forwarded to `pipeline.configure`.
	"""
	pipeline = PMMLPipeline([
		("union", FeatureUnion([
			# RandomForest works on float32; everything else gets float64
			("tf-idf", TfidfVectorizer(analyzer = "word", preprocessor = None, strip_accents = None, lowercase = True, tokenizer = tokenizer, stop_words = "english", ngram_range = (1, 2), norm = None, sublinear_tf = isinstance(classifier, LogisticRegressionCV), dtype = (numpy.float32 if isinstance(classifier, RandomForestClassifier) else numpy.float64))),
			("count", WordCountTransformer())
		])),
		("selector", SelectKBest(f_classif, k = 1000)),
		("classifier", classifier)
	])
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
	# Fix: "== True" comparison replaced with direct truthiness test
	if with_proba:
		score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
		score = pandas.concat((score, score_proba), axis = 1)
	store_csv(score, name)
Пример #20
0
def build_audit_na(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit, verify and store an audit classification pipeline on data with missing values.

	Parameters:
		classifier: the final estimator step.
		name: basename for the stored artifacts.
		with_proba: also store class probabilities alongside predictions.
		fit_params / predict_params / predict_proba_params: optional keyword
			arguments forwarded to the corresponding pipeline methods.
		predict_transformer / predict_proba_transformer / apply_transformer:
			optional post-processing transformers attached to the pipeline.
		pmml_options: PMML conversion options forwarded to `pipeline.configure`.
	"""
	# Fix: mutable default arguments ({} shared across calls) replaced with None sentinels
	fit_params = {} if fit_params is None else fit_params
	predict_params = {} if predict_params is None else predict_params
	predict_proba_params = {} if predict_proba_params is None else predict_proba_params
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	# Missing gender is mapped midway between the two encoded values
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		[(["Age"], MissingIndicator())] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		# XGBoost arithmetic is float32-based, so allow a small tolerance
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		# Also record the decision-tree leaf node assigned to each row
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
Пример #21
0
def build_audit_na(classifier, name, with_proba = True, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit and store an audit classification pipeline on data with missing values.

	Parameters:
		classifier: the final estimator step.
		name: basename for the stored `.pkl` and `.csv` artifacts.
		with_proba: also store class probabilities alongside predictions.
		predict_proba_transformer / apply_transformer: optional post-processing
			transformers attached to the pipeline.
		pmml_options: PMML conversion options forwarded to `pipeline.configure`.
	"""
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0,
		"MALE" : 1
	}
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999"), name = "flag_missing(Age, -999)"), Imputer(missing_values = -999)])] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), Imputer(missing_values = -999)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), Imputer()])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	# Fix: "== True" comparison replaced with direct truthiness test
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		# Also record the decision-tree leaf node assigned to each row
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name + ".csv")
Пример #22
0
        LabelBinarizer()
    ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()),
    (["Hours", "Income"],
     Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))
])
# Pairwise interaction features between Gender and Marital status
interaction_mapper = DataFrameMapper([
    ("Gender", [CategoricalDomain(), LabelBinarizer()]),
    ("Marital", [CategoricalDomain(), LabelBinarizer()])
])
classifier = XGBClassifier()

pipeline = PMMLPipeline([
    ("mapper",
     FeatureUnion([("scalar_mapper", scalar_mapper),
                   ("interaction",
                    Pipeline([("interaction_mapper", interaction_mapper),
                              ("polynomial", PolynomialFeatures())]))])),
    ("classifier", classifier)
])
pipeline.fit(audit_X, audit_y)

pipeline.configure(compact=True)
# NOTE(review): sample(100) has no random_state, so the verification sample
# differs between runs — consider pinning a seed for reproducibility
pipeline.verify(audit_X.sample(100), zeroThreshold=1e-6, precision=1e-6)

sklearn2pmml(pipeline, "pmml/XGBoostAudit.pmml")

if "--deploy" in sys.argv:
    from openscoring import Openscoring

    # Fix: the original bound the client to the name "os", shadowing the
    # standard-library os module for the rest of the script
    openscoring_client = Openscoring("http://localhost:8080/openscoring")
    openscoring_client.deployFile("XGBoostAudit", "pmml/XGBoostAudit.pmml")