Exemplo n.º 1
0
def build_audit(classifier, name, **pmml_options):
	"""Fit an audit classification pipeline, store it as PMML, and dump class + probability predictions as CSV."""
	is_lgbm = isinstance(classifier, LGBMClassifier)
	if is_lgbm:
		# LightGBM consumes label-encoded categoricals natively, so "Age" is treated as categorical here
		cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Income", "Hours"]
	else:
		cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Age", "Income", "Hours"]
	# LightGBM gets label-encoded categoricals; every other estimator gets binarized ones
	encoder = label_encoder if is_lgbm else label_binarizer
	cat_mappings = [([column], [cat_domain(name), encoder(name)]) for column in cat_columns]
	cont_mappings = [([column], cont_domain(name)) for column in cont_columns]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
		("classifier", classifier)
	])
	if is_lgbm:
		# The first six mapper outputs are the label-encoded categorical columns
		pipeline.fit(audit_X, audit_y, classifier__categorical_feature = [0, 1, 2, 3, 4, 5])
	else:
		if isinstance(classifier, XGBClassifier) and name == "XGBoostAuditNA":
			audit_X["Age"] = audit_X["Age"].astype(float)
		pipeline.fit(audit_X, audit_y)
	verification_sample = audit_X.sample(n = 3, random_state = 13)
	# XGBoost works in float32, so verification needs relaxed tolerances
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(verification_sample, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(verification_sample)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
Exemplo n.º 2
0
def build_auto(regressor, name, **pmml_options):
	"""Fit an "auto" (mpg) regression pipeline with engineered features, verify it, and store the pipeline plus its predictions."""
	# Joint (cylinders, origin) category pairs; any pair not listed maps to "other" below
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(6, 3) : "6/3",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post oil crisis effects — NOTE(review): threshold 77 corresponds to model year 1977, not 1973; confirm intent
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		# presumably de-duplicates identical feature columns produced by the mapper — sklearn2pmml SelectUnique
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	# XGBoost works in float32, so verification needs relaxed tolerances
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)
Exemplo n.º 3
0
def build_auto(regressor, name, **pmml_options):
	"""Fit an "auto" regression (or outlier detection) pipeline, store it as PMML, and dump its predictions as CSV."""
	cat_columns = ["cylinders", "model_year", "origin"]
	cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
	# LightGBM gets label-encoded categoricals; every other estimator gets binarized ones
	encoder = label_encoder if isinstance(regressor, LGBMRegressor) else label_binarizer
	cat_mappings = [([column], [cat_domain(name), encoder(name)]) for column in cat_columns]
	cont_mappings = [([column], [cont_domain(name)]) for column in cont_columns]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(cat_mappings + cont_mappings)),
		("regressor", regressor)
	])
	if isinstance(regressor, LGBMRegressor):
		# The first three mapper outputs are the label-encoded categorical columns
		pipeline.fit(auto_X, auto_y, regressor__categorical_feature = [0, 1, 2])
	elif isinstance(regressor, IsolationForest):
		# Unsupervised: no target column
		pipeline.fit(auto_X)
	else:
		pipeline.fit(auto_X, auto_y)
	verification_sample = auto_X.sample(n = 3, random_state = 13)
	# XGBoost works in float32, so verification needs relaxed tolerances
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(verification_sample, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(verification_sample)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	if isinstance(regressor, IsolationForest):
		decision_function = DataFrame(pipeline.decision_function(auto_X), columns = ["decisionFunction"])
		outlier = DataFrame(pipeline.predict(auto_X), columns = ["outlier"])
		# predict() yields -1 for outliers; export as "true"/"false" strings
		outlier["outlier"] = outlier["outlier"].apply(lambda value: str(bool(value == -1)).lower())
		store_csv(pandas.concat((decision_function, outlier), axis = 1), name)
	else:
		store_csv(DataFrame(pipeline.predict(auto_X), columns = ["mpg"]), name)
Exemplo n.º 4
0
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
	"""Fit an "auto" (mpg) regression pipeline with heavily engineered features, verify it, and store the pipeline plus its predictions.

	regressor: estimator used as the final pipeline step.
	name: base name used by store_pkl / store_csv.
	fit_params: optional keyword arguments forwarded to pipeline.fit.
	predict_params: optional keyword arguments forwarded to pipeline.predict and verification.
	pmml_options: PMML conversion options forwarded to pipeline.configure.
	"""
	# Fix: mutable default arguments ({}) are shared across calls; normalize None instead
	fit_params = dict(fit_params or {})
	predict_params = dict(predict_params or {})
	# Joint (cylinders, origin) category pairs; any pair not listed maps to "other" below
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		# Rebuild model_year as a date string, convert to days since 1977, then binarize at 0
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBoost works in float32, so verification needs relaxed tolerances
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
Exemplo n.º 5
0
def main(input_data, output_data, model_dest):
    """Train a random forest regressor on CSV data and save it both as a joblib artifact and as a PMML document.

    input_data: path to the feature CSV.
    output_data: path to the target CSV.
    model_dest: destination path prefix; ".joblib" and ".pmml" are appended.
    """
    logger = logging.getLogger(__name__)
    logger.info("Loading input and output data")
    inputs = pd.read_csv(input_data)
    outputs = pd.read_csv(output_data)
    X_train, X_test, y_train, y_test = train_test_split(inputs,
                                                        outputs,
                                                        test_size=0.4,
                                                        random_state=23)

    model = RandomForestRegressor(verbose=True, n_jobs=-1)

    logger.info("Fitting model")
    # Fix: the model was previously fitted twice back-to-back; a single fit suffices
    model.fit(X_train, y_train)

    logger.info("Saving joblib model")
    dump(model, model_dest + ".joblib")

    # A fresh estimator instance is fitted inside the PMML pipeline
    pipeline = PMMLPipeline([("classifier",
                              RandomForestRegressor(verbose=True, n_jobs=-1))])
    pipeline.fit(X_train, y_train)
    # Embed verification records so PMML consumers can self-check predictions
    pipeline.verify(X_test.sample(n=10))

    logger.info("Saving PMML model")
    sklearn2pmml(pipeline, model_dest + ".pmml")
Exemplo n.º 6
0
def build_auto_h2o(regressor, name):
    """Fit an "auto" regression pipeline ending in an H2O estimator; store the MOJO, the pickled pipeline, and predictions."""
    # Declare per-column categorical/continuous input domains
    transformer = ColumnTransformer(
        [(column, CategoricalDomain(), [column])
         for column in ["cylinders", "model_year", "origin"]] +
        [(column, ContinuousDomain(), [column]) for column in
         ["displacement", "horsepower", "weight", "acceleration"]])
    # H2OFrameCreator uploads the transformed matrix to the H2O cluster with explicit names/types
    pipeline = PMMLPipeline([("transformer", transformer),
                             ("uploader",
                              H2OFrameCreator(column_names=[
                                  "cylinders", "model_year", "origin",
                                  "displacement", "horsepower", "weight",
                                  "acceleration"
                              ],
                                              column_types=[
                                                  "enum", "enum", "enum",
                                                  "numeric", "numeric",
                                                  "numeric", "numeric"
                                              ])), ("regressor", regressor)])
    # The target is uploaded as a single-column H2OFrame
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    regressor = pipeline._final_estimator
    store_mojo(regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")
    mpg = pipeline.predict(auto_X)
    # predict() returns an H2OFrame; rename its column and convert to pandas for CSV export
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
Exemplo n.º 7
0
def build_iris(classifier, name, **pmml_options):
    """Fit an Iris classification pipeline, store it as PMML, and dump class + probability predictions as CSV."""
    cont_columns = [
        "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"
    ]
    mapper = DataFrameMapper([([column], ContinuousDomain())
                              for column in cont_columns])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    verification_sample = iris_X.sample(n=3, random_state=13)
    # XGBoost works in float32, so verification needs relaxed tolerances
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(verification_sample,
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(verification_sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    proba_columns = [
        "probability(setosa)", "probability(versicolor)",
        "probability(virginica)"
    ]
    species_proba = DataFrame(pipeline.predict_proba(iris_X),
                              columns=proba_columns)
    store_csv(pandas.concat((species, species_proba), axis=1), name)
Exemplo n.º 8
0
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit an "auto" regression pipeline on data with missing values, exercising several imputation strategies, and store pipeline plus predictions."""
	mapper = DataFrameMapper(
		# -1 acts as the missing-value sentinel for the integer-coded categoricals
		[([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
		[(["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()])] +
		# Continuous columns use None/NaN as the missing marker; binning turns them categorical
		[(["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()])] +
		[(["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
		# Out-of-range values are clamped to the declared extremes before median imputation
		[(["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")])] +
		[(["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	], predict_transformer = predict_transformer, apply_transformer = apply_transformer)
	pipeline.fit(auto_na_X, auto_na_y)
	if isinstance(regressor, DecisionTreeRegressor):
		# Export per-node impurities (non-zero only) as PMML node extensions keyed by the split criterion
		tree = regressor.tree_
		node_impurity = {node_idx : tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
		pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
	pipeline.configure(**pmml_options)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	if isinstance(regressor, DecisionTreeRegressor):
		# Also record which leaf node each row lands in
		Xt = pipeline_transform(pipeline, auto_na_X)
		mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
		mpg = pandas.concat((mpg, mpg_apply), axis = 1)
	store_csv(mpg, name)
Exemplo n.º 9
0
def build_auto_isotonic(regressor, auto_isotonic_X, name):
	"""Fit a bare regression pipeline on the supplied isotonic feature matrix and store it together with its predictions."""
	pipeline = PMMLPipeline([("regressor", regressor)])
	pipeline.fit(auto_isotonic_X, auto_y)
	pipeline.verify(auto_isotonic_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	predictions = DataFrame(pipeline.predict(auto_isotonic_X), columns = ["mpg"])
	store_csv(predictions, name)
Exemplo n.º 10
0
def build_auto_opt(regressor, name, fit_params = None, **pmml_options):
	"""Fit a bare regression pipeline on the training subset of the "auto" dataset and store it with full-dataset predictions.

	regressor: estimator used as the only pipeline step.
	name: base name used by store_pkl / store_csv.
	fit_params: optional keyword arguments forwarded to pipeline.fit.
	pmml_options: PMML conversion options forwarded to pipeline.configure.
	"""
	# Fix: mutable default argument ({}) is shared across calls; normalize None instead
	fit_params = dict(fit_params or {})
	pipeline = PMMLPipeline([
		("regressor", regressor)
	])
	pipeline.fit(auto_X[auto_train_mask], auto_y[auto_train_mask], **fit_params)
	# XGBoost works in float32, so verification needs relaxed tolerances
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	# Fix: pmml_options were previously accepted but silently ignored (sibling builders call configure)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)
Exemplo n.º 11
0
def build_iris_opt(classifier, name, fit_params = None, **pmml_options):
	"""Fit a bare Iris classification pipeline on the training subset and store it with full-dataset predictions.

	classifier: estimator used as the only pipeline step.
	name: base name used by store_pkl / store_csv.
	fit_params: optional keyword arguments forwarded to pipeline.fit.
	pmml_options: PMML conversion options forwarded to pipeline.configure.
	"""
	# Fix: mutable default argument ({}) is shared across calls; normalize None instead
	fit_params = dict(fit_params or {})
	pipeline = PMMLPipeline([
		("classifier", classifier)
	])
	pipeline.fit(iris_X[iris_train_mask], iris_y[iris_train_mask], **fit_params)
	# XGBoost works in float32, so verification needs relaxed tolerances
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13))
	# Fix: pmml_options were previously accepted but silently ignored (sibling builders call configure)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
	species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, name)
Exemplo n.º 12
0
def build_auto_na_hist(regressor, name):
	"""Fit a regression pipeline on the missing-values "auto" dataset and store it together with its predictions."""
	cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
	cat_columns = ["cylinders", "model_year", "origin"]
	mappings = [([column], ContinuousDomain()) for column in cont_columns]
	mappings += [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in cat_columns]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(mappings)),
		("regressor", regressor)
	])
	pipeline.fit(auto_na_X, auto_na_y)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	store_csv(DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"]), name)
Exemplo n.º 13
0
def build_visit(regressor, name):
	"""Fit a doctor-visits ("docvis") regression pipeline and store it together with its predictions."""
	binary_columns = ["outwork", "female", "married", "kids", "self"]
	mappings = [(["edlevel"], [CategoricalDomain(), OneHotEncoder()])]
	mappings += [([column], [CategoricalDomain(), OneHotEncoder()]) for column in binary_columns]
	mappings += [(["age"], ContinuousDomain())]
	mappings += [(["hhninc", "educ"], ContinuousDomain())]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(mappings)),
		("regressor", regressor)
	])
	pipeline.fit(visit_X, visit_y)
	pipeline.verify(visit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	store_csv(DataFrame(pipeline.predict(visit_X), columns = ["docvis"]), name)
Exemplo n.º 14
0
def build_audit_na_direct(classifier, name):
	"""Fit an audit classification pipeline that passes numeric columns through untouched and one-hot encodes categoricals."""
	mapper = DataFrameMapper([
		# None = pass these numeric columns through without transformation
		(["Age", "Hours", "Income"], None),
		(["Employment", "Education", "Marital", "Occupation", "Gender"], OneHotEncoder())
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	store_pkl(pipeline, name)
	predictions = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	probabilities = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((predictions, probabilities), axis = 1), name)
Exemplo n.º 15
0
def build_versicolor_direct(classifier, name, with_proba=True, **pmml_options):
    """Fit a versicolor classification pipeline on the two petal columns only and store it with its predictions.

    classifier: estimator used as the final pipeline step.
    name: base name used by store_pkl / store_csv.
    with_proba: when truthy, also export class probability columns.
    pmml_options: PMML conversion options forwarded to pipeline.configure.
    """
    # Only the petal columns are selected; everything else is dropped
    transformer = ColumnTransformer(
        [("all", "passthrough", ["Petal.Length", "Petal.Width"])],
        remainder="drop")
    pipeline = PMMLPipeline([("transformer", transformer),
                             ("classifier", classifier)])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline.configure(**pmml_options)
    pipeline.verify(versicolor_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    # Idiom fix: truthiness test instead of "== True"
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(versicolor_X),
                                  columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name)
Exemplo n.º 16
0
def build_audit_na_hist(classifier, name):
	"""Fit an audit classification pipeline on data with missing values and store it together with its predictions."""
	cont_columns = ["Age", "Hours", "Income"]
	cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
	mappings = [([column], ContinuousDomain()) for column in cont_columns]
	mappings += [([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in cat_columns]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(mappings)),
		("classifier", classifier)
	])
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	predictions = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	probabilities = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((predictions, probabilities), axis = 1), name)
Exemplo n.º 17
0
def main(input_data, output_data, model_dest):
    """Train a price-range random forest classifier on CSV data and save it both as joblib and as PMML.

    input_data: path to the feature CSV.
    output_data: path to the target CSV (must contain a "price_range" column).
    model_dest: destination path prefix; ".joblib" and ".pmml" are appended.
    """
    logger = logging.getLogger(__name__)
    logger.info("Loading input and output data")
    inputs = pd.read_csv(input_data)
    outputs = pd.read_csv(output_data)
    X_train, X_test, y_train, y_test = train_test_split(inputs,
                                                        outputs,
                                                        test_size=0.4,
                                                        random_state=23)

    model = RandomForestClassifier(verbose=True, n_jobs=-1)

    logger.info("Fitting model")
    # ravel() flattens the target column to a 1-D array for sklearn
    model = model.fit(X_train, y_train["price_range"].ravel())

    logger.info("Saving joblib model")
    dump(model, model_dest + ".joblib")

    # A separate estimator instance is fitted inside the PMML pipeline
    pipeline = PMMLPipeline([("classifier",
                              RandomForestClassifier(verbose=True,
                                                     n_jobs=-1))])
    pipeline.fit(X_train, y_train["price_range"].ravel())
    # Embed verification records so PMML consumers can self-check predictions
    pipeline.verify(X_test.sample(n=10))

    logger.info("Saving PMML model")
    # NOTE(review): the feature list below must match the columns of the input CSV — confirm against data source
    skl_to_pmml(
        pipeline,
        [
            "battery_power",
            "clock_speed",
            "fc",
            "int_memory",
            "m_dep",
            "mobile_wt",
            "n_cores",
            "pc",
            "px_height",
            "px_width",
            "ram",
            "sc_h",
            "sc_w",
            "talk_time",
        ],
        "price_range",
        model_dest + ".pmml",
    )
Exemplo n.º 18
0
def build_audit_h2o(classifier, name):
	"""Fit an audit classification pipeline ending in an H2O estimator; store the MOJO, the pickled pipeline, and predictions."""
	mapper = DataFrameMapper(
		[([column], ContinuousDomain()) for column in ["Age", "Hours", "Income"]] +
		[([column], CategoricalDomain()) for column in ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		# Uploads the mapped matrix to the H2O cluster before the remote classifier runs
		("uploader", H2OFrameCreator()),
		("classifier", classifier)
	])
	# The target is uploaded as a single categorical-typed H2O column
	pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types = ["categorical"]))
	pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	classifier = pipeline._final_estimator
	store_mojo(classifier, name)
	store_pkl(pipeline, name)
	adjusted = pipeline.predict(audit_X)
	# predict() returns an H2OFrame with label + per-class probability columns
	adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
	store_csv(adjusted.as_data_frame(), name)
Exemplo n.º 19
0
def build_audit_na(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit an audit classification pipeline on data with missing values, exercising many imputation/flagging strategies, and store pipeline plus predictions.

	NOTE(review): the dict-typed default arguments ({}) are mutable defaults; harmless here only if callers never mutate them.
	"""
	# Collapse detailed employment categories into PRIVATE/PUBLIC; unknowns map to "OTHER" below
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	# Encode gender numerically; imputed missing marker lands halfway between
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		# Age: re-encode missing as -999, then impute that sentinel with a constant
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		# Separate boolean missingness indicator for Age
		[(["Age"], MissingIndicator())] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		# Income: out-of-range values become missing, then median-imputed with an indicator column
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBoost works in float32, so verification needs relaxed tolerances
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		# Also record which leaf node each row lands in
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
Exemplo n.º 20
0
 def test_fit_verify(self):
     """Check that fitting a PMMLPipeline records active/target field names, refitting updates them, and verify() captures verification data."""
     pipeline = PMMLPipeline([("estimator", DummyRegressor())])
     # Field metadata only exists after fit
     self.assertFalse(hasattr(pipeline, "active_fields"))
     self.assertFalse(hasattr(pipeline, "target_fields"))
     X = DataFrame([[1, 0], [2, 0], [3, 0]], columns=["X1", "X2"])
     y = Series([0.5, 1.0, 1.5], name="y")
     pipeline.fit(X, y)
     self.assertEqual(["X1", "X2"], pipeline.active_fields.tolist())
     self.assertEqual("y", pipeline.target_fields.tolist())
     # Refitting with renamed columns must refresh the recorded field names
     X.columns = ["x1", "x2"]
     pipeline.fit(X, y)
     self.assertEqual(["x1", "x2"], pipeline.active_fields.tolist())
     self.assertEqual("y", pipeline.target_fields.tolist())
     self.assertFalse(hasattr(pipeline, "verification"))
     pipeline.verify(X.sample(2))
     # verify() stores the sampled rows plus the corresponding predictions
     self.assertEqual(2, len(pipeline.verification.active_values))
     self.assertEqual(2, len(pipeline.verification.target_values))
     # Verification with mismatched column order must be rejected
     X.columns = ["x2", "x1"]
     with self.assertRaises(ValueError):
         pipeline.verify(X.sample(2))
Exemplo n.º 21
0
def main(input_data, output_data, model_dest):
    """Train a tuned random forest classifier on CSV data and save it both as joblib and as PMML.

    input_data: path to the feature CSV.
    output_data: path to the target CSV.
    model_dest: destination path prefix; ".joblib" and ".pmml" are appended.
    """
    logger = logging.getLogger(__name__)
    logger.info("Loading input and output data")
    inputs = pd.read_csv(input_data)
    outputs = pd.read_csv(output_data)
    X_train, X_test, y_train, y_test = train_test_split(inputs,
                                                        outputs,
                                                        test_size=0.4,
                                                        random_state=23)

    logger.info("Create model (tuned hyperparams)")
    model = RandomForestClassifier(
        max_depth=8,
        max_leaf_nodes=64,
        max_samples=0.5,
        n_estimators=10,
        verbose=True,
        n_jobs=-1,
    )

    logger.info("Fitting model")
    model = model.fit(X_train, y_train)

    logger.info("Saving joblib model")
    dump(model, model_dest + ".joblib")

    # NOTE(review): the PMML pipeline uses a default-parameter forest, not the tuned one above — confirm this is intentional
    pipeline = PMMLPipeline([("classifier",
                              RandomForestClassifier(verbose=True,
                                                     n_jobs=-1))])
    pipeline.fit(X_train, y_train)
    # Embed verification records so PMML consumers can self-check predictions
    pipeline.verify(X_test.sample(n=10))

    logger.info("Saving PMML model")
    # sklearn2pmml(pipeline, model_dest + ".pmml")
    skl_to_pmml(
        pipeline,
        ["Age", "Debt", "YearsEmployed", "Income"],
        "Approved",
        model_dest + ".pmml",
    )
Exemplo n.º 22
0
        LabelBinarizer()
    ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()),
    (["Hours", "Income"],
     Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))
])
# Gender/Marital are mapped separately so they can be crossed via PolynomialFeatures below
interaction_mapper = DataFrameMapper([
    ("Gender", [CategoricalDomain(), LabelBinarizer()]),
    ("Marital", [CategoricalDomain(), LabelBinarizer()])
])
classifier = XGBClassifier()

# Combine the scalar features with pairwise Gender x Marital interaction terms
pipeline = PMMLPipeline([
    ("mapper",
     FeatureUnion([("scalar_mapper", scalar_mapper),
                   ("interaction",
                    Pipeline([("interaction_mapper", interaction_mapper),
                              ("polynomial", PolynomialFeatures())]))])),
    ("classifier", classifier)
])
pipeline.fit(audit_X, audit_y)

# Compact tree representation in the generated PMML
pipeline.configure(compact=True)
# XGBoost works in float32, so verification needs relaxed tolerances
pipeline.verify(audit_X.sample(100), zeroThreshold=1e-6, precision=1e-6)

sklearn2pmml(pipeline, "pmml/XGBoostAudit.pmml")

# Optionally deploy the generated PMML to a local Openscoring server
if "--deploy" in sys.argv:
    from openscoring import Openscoring

    # NOTE(review): "os" shadows the stdlib os module for the rest of this script
    os = Openscoring("http://localhost:8080/openscoring")
    os.deployFile("XGBoostAudit", "pmml/XGBoostAudit.pmml")
                     sublinear_tf=True,
                     max_df=0.1,
                     min_df=0.0001,
                     norm=None,
                     tokenizer=Splitter())),
    (
        'linear',
        SGDClassifier(
            # params
        )),
])
# Train the model
pipeline.fit(train_data, train_labels)

# Pack verification data and verify
pipeline.verify(test_data)

# Save the pipeline + model (wrapped so that active/target field names are explicit)
sklearn2pmml.sklearn2pmml(sklearn2pmml.make_pmml_pipeline(
    pipeline, active_fields=FEATURE_COLS, target_fields=TARGET_COL),
                          'model.pmml',
                          with_repr=True,
                          debug=True)

# Measure the accuracy
# NOTE(review): "descision" is misspelled and the value is never used afterwards
descision = pipeline.decision_function(test_data.sample(1))

# Score an externally prepared JPMML test input file and print the predictions in full
jpmml_input_data = pd.read_csv("jpmml-test-input.csv", usecols=FEATURE_COLS)
input_pred = pipeline.predict(jpmml_input_data.squeeze())
with numpy.printoptions(threshold=numpy.inf):
    print(input_pred)
Exemplo n.º 24
0
    def __init__(self):
        # A single pseudo-split: cross-validation is effectively disabled
        self.n_splits = 1

    def split(self, X, y, groups=None):
        # Yield one train/test split where both index arrays cover the full dataset
        yield (numpy.arange(len(X)), numpy.arange(len(y)))

    def get_n_splits(self, X, y, groups=None):
        # Always 1, matching the single split produced by split()
        return self.n_splits


# Stack LightGBM, XGBoost and sklearn sub-pipelines; DisabledCV makes the
# final estimator train on full-data out-of-fold predictions (single trivial split)
pipeline = PMMLPipeline([("mapper", mapper),
                         ("ensemble",
                          StackingClassifier([("lightgbm", lightgbm_pipeline),
                                              ("xgboost", xgboost_pipeline),
                                              ("sklearn", sklearn_pipeline)],
                                             final_estimator=final_estimator,
                                             cv=DisabledCV(),
                                             passthrough=False))])

# Alternative two-member ensemble kept for reference:
# pipeline = PMMLPipeline([
#     ("mapper", mapper),
#     ("ensemble", StackingClassifier([
#         ("lightgbm", lightgbm_pipeline), ("sklearn", sklearn_pipeline)
#     ], final_estimator=final_estimator, cv=DisabledCV(), passthrough=False))
# ])

pipeline.fit(df_X, df_y)
# Embed verification records so PMML consumers can self-check predictions
pipeline.verify(df_X.sample(n=10, random_state=13))

sklearn2pmml(pipeline, "../pmml/StackingEnsembleAudit.pmml")
Exemplo n.º 25
0
                          ("Income", ContinuousDomain()),
                          (["Hours", "Income"],
                           Alias(ExpressionTransformer("X[1] / (X[0] * 52)"),
                                 "Hourly_Income"))])
classifier = H2ORandomForestEstimator(ntrees=17)

# Post-process the positive-class probability (X[1]) into a no/maybe/yes decision label
predict_proba_transformer = Pipeline([
    ("expression", ExpressionTransformer("X[1]")),
    ("cut",
     Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0],
                          labels=["no", "maybe", "yes"]),
           "Decision",
           prefit=True))
])

# Local feature mapping, upload to the H2O cluster, then remote classification
pipeline = PMMLPipeline([("local_mapper", mapper),
                         ("uploader", H2OFrameCreator()),
                         ("remote_classifier", classifier)],
                        predict_proba_transformer=predict_proba_transformer)
# The target is uploaded as a single categorical-typed H2O column
pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(),
                               column_types=["categorical"]))

pipeline.verify(audit_X.sample(100))

sklearn2pmml(pipeline, "pmml/RandomForestAudit.pmml")

# Optionally deploy the generated PMML to a local Openscoring server
if "--deploy" in sys.argv:
    from openscoring import Openscoring

    # NOTE(review): "os" shadows the stdlib os module for the rest of this script
    os = Openscoring("http://localhost:8080/openscoring")
    os.deployFile("RandomForestAudit", "pmml/RandomForestAudit.pmml")
Exemplo n.º 26
0
	store_pkl(pipeline, "SelectFirstIris")
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
	species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, "SelectFirstIris")

if "Iris" in datasets:
	# Hand-written decision rules; first matching rule wins, otherwise default_score applies
	classifier = RuleSetClassifier([
		("X['Petal.Length'] >= 2.45 and X['Petal.Width'] < 1.75", "versicolor"),
		("X['Petal.Length'] >= 2.45", "virginica")
	], default_score = "setosa")
	pipeline = PMMLPipeline([
		("classifier", classifier)
	])
	pipeline.fit(iris_X, iris_y)
	pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13))
	store_pkl(pipeline, "RuleSetIris")
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	store_csv(species, "RuleSetIris")

#
# Text classification
#

sentiment_X, sentiment_y = load_sentiment("Sentiment")

def build_sentiment(classifier, name, with_proba = True, **pmml_options):
	pipeline = PMMLPipeline([
		("tf-idf", TfidfVectorizer(analyzer = "word", preprocessor = None, strip_accents = None, lowercase = True, token_pattern = None, tokenizer = Splitter(), stop_words = "english", ngram_range = (1, 2), norm = None, dtype = (numpy.float32 if isinstance(classifier, RandomForestClassifier) else numpy.float64))),
		("selector", SelectKBest(f_classif, k = 500)),
		("classifier", classifier)