Example No. 1
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), OneHotEncoder()])
     for f in categorical_features] + [(f, [
         CategoricalDomain(),
         CountVectorizer(tokenizer=Splitter(), max_features=5)
     ]) for f in text_features])

pipeline = PMMLPipeline([("mapper", mapper), ("model", LinearRegression())])
pipeline.fit(data, data["hwy"])

sklearn2pmml(pipeline, "test/support/python/linear_regression_text.pmml")

print(list(pipeline.predict(data[:10])))
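As a hedged optional step (not in the original script), PMMLPipeline.verify can embed verification records before export, which several later examples in this collection rely on:

# Optional: embed verification data, then re-export (assumes the objects defined above).
pipeline.verify(data.sample(n=10, random_state=13))
sklearn2pmml(pipeline, "test/support/python/linear_regression_text.pmml")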
Example No. 2
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
	species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, name)

if "Iris" in datasets:
	build_iris_opt(LGBMClassifier(objective = "multiclass"), "LGBMIris", fit_params = {"classifier__eval_set" : [(iris_X[iris_test_mask], iris_y[iris_test_mask])], "classifier__eval_metric" : "multi_logloss", "classifier__early_stopping_rounds" : 3})
	build_iris_opt(XGBClassifier(objective = "multi:softprob"), "XGBIris", fit_params = {"classifier__eval_set" : [(iris_X[iris_test_mask], iris_y[iris_test_mask])], "classifier__eval_metric" : "mlogloss", "classifier__early_stopping_rounds" : 3})

if "Iris" in datasets:
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper([
			(iris_X.columns.values, ContinuousDomain())
		])),
		("classifier", SelectFirstClassifier([
			("select", Pipeline([
				("classifier", DecisionTreeClassifier(random_state = 13))
			]), "X[1] <= 3"),
			("default", Pipeline([
				("scaler", StandardScaler()),
				("classifier", LogisticRegression(multi_class = "ovr", solver = "liblinear"))
			]), str(True))
		]))
	])
	pipeline.fit(iris_X, iris_y)
	store_pkl(pipeline, "SelectFirstIris")
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
	species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, "SelectFirstIris")

if "Iris" in datasets:
	classifier = RuleSetClassifier([
Example No. 3
	build_iris_opt(LGBMClassifier(objective = "multiclass"), "LGBMIris", fit_params = {"classifier__eval_set" : [(iris_X[iris_test_mask], iris_y[iris_test_mask])], "classifier__eval_metric" : "multi_logloss", "classifier__early_stopping_rounds" : 3})
	build_iris_opt(XGBClassifier(objective = "multi:softprob"), "XGBIris", fit_params = {"classifier__eval_set" : [(iris_X[iris_test_mask], iris_y[iris_test_mask])], "classifier__eval_metric" : "mlogloss", "classifier__early_stopping_rounds" : 3})

if "Iris" in datasets:
	mapper = DataFrameMapper([
		(iris_X.columns.values, ContinuousDomain())
	])
	iris_Xt = mapper.fit_transform(iris_X)
	dt_classifier = DecisionTreeClassifier(random_state = 13)
	dt_classifier.fit(iris_Xt, iris_y)
	lr_classifier = LogisticRegression(multi_class = "ovr", solver = "liblinear")
	lr_classifier.fit(iris_Xt, iris_y)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("estimator", SelectFirstEstimator([
			("X[2] <= 3", dt_classifier),
			(str(True), lr_classifier)
		]))
	])
	pipeline.active_fields = iris_X.columns.values
	pipeline.target_fields = ["Species"]
	store_pkl(pipeline, "SelectFirstIris")
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	store_csv(species, "SelectFirstIris")

if "Iris" in datasets:
	classifier = RuleSetClassifier([
		("X['Petal.Length'] >= 2.45 and X['Petal.Width'] < 1.75", "versicolor"),
		("X['Petal.Length'] >= 2.45", "virginica")
	], default_score = "setosa")
	pipeline = PMMLPipeline([
Example No. 4
                       labels=["q1", "q2", "q3", "q4"]),
        LabelBinarizer()
    ]), ("Hours", ContinuousDomain()), ("Income", ContinuousDomain()),
    (["Hours", "Income"],
     Alias(ExpressionTransformer("X[1] / (X[0] * 52)"), "Hourly_Income"))
])
interaction_mapper = DataFrameMapper([
    ("Gender", [CategoricalDomain(), LabelBinarizer()]),
    ("Marital", [CategoricalDomain(), LabelBinarizer()])
])
classifier = XGBClassifier()

pipeline = PMMLPipeline([
    ("mapper",
     FeatureUnion([("scalar_mapper", scalar_mapper),
                   ("interaction",
                    Pipeline([("interaction_mapper", interaction_mapper),
                              ("polynomial", PolynomialFeatures())]))])),
    ("classifier", classifier)
])
pipeline.fit(audit_X, audit_y)

pipeline.configure(compact=True)
pipeline.verify(audit_X.sample(100), zeroThreshold=1e-6, precision=1e-6)

sklearn2pmml(pipeline, "pmml/XGBoostAudit.pmml")

if "--deploy" in sys.argv:
    from openscoring import Openscoring

    os = Openscoring("http://localhost:8080/openscoring")
    os.deployFile("XGBoostAudit", "pmml/XGBoostAudit.pmml")
Example No. 5
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.ruleset import RuleSetClassifier

import pandas
import sys

iris_df = pandas.read_csv("csv/Iris.csv")
#print(iris_df.head(5))

iris_X = iris_df[iris_df.columns.difference(["Species"])]
iris_y = iris_df["Species"]

classifier = RuleSetClassifier([
	("X['Petal_Length'] < 2.45", "setosa"),
	("X['Petal_Width'] < 1.75", "versicolor"),
], default_score = "virginica")

pipeline = PMMLPipeline([
	("classifier", classifier)
])
pipeline.fit(iris_X, iris_y)

sklearn2pmml(pipeline, "pmml/RuleSetIris.pmml")

if "--deploy" in sys.argv:
	from openscoring import Openscoring

	os = Openscoring("http://localhost:8080/openscoring")
	os.deployFile("RuleSetIris", "pmml/RuleSetIris.pmml")
Example No. 6
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

binary = False

data = pd.read_csv("test/support/mpg.csv")
if binary:
    data["drv"] = data["drv"].replace("r", "4")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), PMMLLabelEncoder()])
     for f in categorical_features] +
    [(f, [CategoricalDomain(),
          CountVectorizer(tokenizer=Splitter())]) for f in text_features])

pipeline = PMMLPipeline([("mapper", mapper),
                         ("model", LGBMClassifier(n_estimators=1000))])
pipeline.fit(data, data["drv"], model__categorical_feature=[3])

suffix = "binary" if binary else "multiclass"
sklearn2pmml(pipeline, "test/support/python/lightgbm_" + suffix + ".pmml")

print(list(pipeline.predict(data[:10])))
print(list(pipeline.predict_proba(data[0:1])[0]))
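The fit parameter model__categorical_feature=[3] is positional: after the mapper, the first three output columns are the numeric features, so index 3 is the encoded "class" column. A quick hedged check of that layout (not in the original):

# Confirm the mapper's column order before trusting a positional categorical_feature index.
Xt = mapper.fit_transform(data)
print(Xt.shape)  # expected: (n_rows, 4) -> displ, year, cyl, encoded "class"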
Example No. 7
                          ("Income", ContinuousDomain()),
                          (["Hours", "Income"],
                           Alias(ExpressionTransformer("X[1] / (X[0] * 52)"),
                                 "Hourly_Income"))])
classifier = H2ORandomForestEstimator(ntrees=17)

predict_proba_transformer = Pipeline([
    ("expression", ExpressionTransformer("X[1]")),
    ("cut",
     Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0],
                          labels=["no", "maybe", "yes"]),
           "Decision",
           prefit=True))
])

pipeline = PMMLPipeline([("local_mapper", mapper),
                         ("uploader", H2OFrameCreator()),
                         ("remote_classifier", classifier)],
                        predict_proba_transformer=predict_proba_transformer)
pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(),
                               column_types=["categorical"]))

pipeline.verify(audit_X.sample(100))

sklearn2pmml(pipeline, "pmml/RandomForestAudit.pmml")

if "--deploy" in sys.argv:
    from openscoring import Openscoring

    os = Openscoring("http://localhost:8080/openscoring")
    os.deployFile("RandomForestAudit", "pmml/RandomForestAudit.pmml")
Example No. 8
'''
@Author: Runsen
@WeChat official account: 润森笔记
@Blog: https://blog.csdn.net/weixin_44510615
@Date: 2020/5/24
'''
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline
pipeline = PMMLPipeline([("pca", PCA(n_components=3)), ("classifier", SVC())])
iris = load_iris()
pipeline.fit(iris.data, iris.target)
sklearn2pmml(pipeline, "iris_SVC.pmml", with_repr=True)
Example No. 9
import pandas

iris_df = pandas.read_csv("Iris.csv")

from sklearn_pandas import DataFrameMapper
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import Imputer  # removed in scikit-learn 0.22; use sklearn.impute.SimpleImputer on newer versions
from sklearn.linear_model import LogisticRegression
from sklearn2pmml.decoration import ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline

pipeline = PMMLPipeline([
    ("mapper",
     DataFrameMapper([
         (["Sepal.Length", "Sepal.Width", "Petal.Length",
           "Petal.Width"], [ContinuousDomain(), Imputer()])
     ])), ("pca", PCA(n_components=3)), ("selector", SelectKBest(k=2)),
    ("classifier", LogisticRegression())
])
pipeline.fit(iris_df, iris_df["Species"])

from sklearn2pmml import sklearn2pmml

sklearn2pmml(pipeline, "LogisticRegressionIris.pmml", with_repr=True)
Example No. 10
class AutoBuilder:
    """'
    E2E classifier builder
    
    Builds binary classifier, including:
        - dataset EDA (optional)
        - hyperparameter tuning (optional)
        - model performance assessment
        - SHAP-based feature analysis
        - feature selection
        - creating deployment package (pmml & pkl)

    Attributes:
        auto_build (method): automatically builds the classifier and populates the output_dir_path with model artifacts and evaluation charts

    """
    def __init__(
        self,
        output_dir_path,
        csv_path,
        target_col="target",
        ignore_cols=[],
        eda_flag=True,
        tune_flag=True,
        cardinality_threshold=100,
        shap_plot_num=10,
        shap_frac=0.05,
        importance_cutoff=0.00,
        corr_cutoff=0.9,
        search_space=LGB_SEARCH_SPACE,
        tuning_iters=25,
        lgb_params={},
        random_state=1234,
    ):
        """
        Args:
            output_dir_path (string): filepath where the outputs package is created and saved
            csv_path (string): filepath to the input csv; NOTE: columns must be preprocessed to numeric or string types
            target_col (string, optional): target column, default 'target'
            ignore_cols (iterable, optional): columns to be dropped, default []
            eda_flag (boolean, optional): whether to generate EDA plots, default True
            tune_flag (boolean, optional): whether to tune LightGBM hyperparameters, default True
            cardinality_threshold (numeric, optional): cardinality threshold for categorical columns, default 100
            shap_plot_num (numeric, optional): generate SHAP dependency plots for the N most important features, default 10
            shap_frac (numeric, optional): proportion of data sampled for SHAP analysis, default 5%
            importance_cutoff (numeric, optional): abs. avg. SHAP value below which dropping a feature is suggested, default 0.00
            corr_cutoff (numeric, optional): abs. correlation above which dropping a feature is suggested, default 0.9
            search_space (iterable, optional): tuning space for Bayesian optimisation, default LGB_SEARCH_SPACE
            tuning_iters (numeric, optional): number of tuning iterations for Bayesian optimisation, default 25
            lgb_params (dict, optional): hyperparameters to use when tune_flag = False, default {}
            random_state (numeric, optional): random seed for the train/test split and model training, default 1234
        """
        self.output_dir_path = output_dir_path
        self.csv_path = csv_path
        self.target_col = target_col
        self.ignore_cols = ignore_cols
        self.eda_flag = eda_flag
        self.tune_flag = tune_flag
        self.cardinality_threshold = cardinality_threshold
        self.shap_plot_num = shap_plot_num
        self.shap_frac = shap_frac
        self.importance_cutoff = importance_cutoff
        self.corr_cutoff = corr_cutoff
        self.search_space = search_space
        self.tuning_iters = tuning_iters
        self.lgb_params = lgb_params
        self.random_state = random_state

    def _gen_model_dir(self):
        """
        Creates output directory according to self.output_dir_path, removing previous output if there.

        Also makes subdirectories
            /bin
            /plots
        """
        logger.info(f"building directory {self.csv_path}")
        if os.path.exists(self.output_dir_path) and os.path.isdir(
                self.output_dir_path):
            shutil.rmtree(self.output_dir_path)
        os.mkdir(self.output_dir_path)
        os.mkdir(self.output_dir_path + "/bin")
        os.mkdir(self.output_dir_path + "/plots")

    def _process_csv(self):
        """
        Parses csv specified in self.csv_path, saving to self.raw

        Also
            - Drops ignore columns
            - Validates target and feature columns
                Target = binary, 0-1
                Features = numeric or string
        """
        logger.info(f"loading file {self.csv_path}")
        raw = pd.read_csv(self.csv_path).drop(columns=self.ignore_cols)

        logger.info("checking valid input data")
        assert raw[self.target_col].isna().sum() == 0

        assert list(sorted(raw[self.target_col].unique())) == [0, 1]

        valid_shape = raw.select_dtypes(
            include=["int64", "float64", "object"]).shape
        assert valid_shape == raw.shape
        self.raw = raw
        raw.to_csv(f"{self.output_dir_path}/bin/raw.csv")

    def _prepare_X_y(self):
        """
        Splits self.raw into X_train, X_test, y_train and y_test

        Also saves a csv of the training set
        """

        y = self.raw[self.target_col]
        X = self.raw.drop(columns=self.target_col)

        logger.info("train test split")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.20, random_state=self.random_state)
        data_train = self.X_train.copy()
        data_train["target"] = self.y_train

        training_data_path = f"{self.output_dir_path}/bin/train.csv"
        data_train.to_csv(training_data_path, index=False)

        del X, y

    def _create_categorical_transformer(self):
        self.categorical_cols = self.X_train.select_dtypes(
            include=["object"]).columns
        self.numeric_cols = self.X_train.select_dtypes(
            include=["int64", "float64"]).columns

        self.mapper = DataFrameMapper(
            [([cat_column],
              [CategoricalDomain(), LabelEncoder()])
             for cat_column in self.categorical_cols] +
            [(self.numeric_cols, ContinuousDomain())])

        # hacky: also storing a separate X_train_encoded and classifier, because SHAP and skopt couldn't be made to work on the end-to-end pipeline
        self.X_train_encoded = self.mapper.fit_transform(self.X_train)
        self.var_names = self.X_train.columns

    def _tune(self):
        """
        Explores tuning space, updating self.lgb_params with values that minimize cross-validated brier score
        """
        # todo, can I save memory, code and possibly tune binning strats by passing unencoded X_train into pipeline?
        logger.info(f"tuning {self.tuning_iters}")
        results = utils.bayes_hyperparam_tune(
            model=lgb.LGBMClassifier(objective="binary"),
            X=self.X_train_encoded,
            y=self.y_train,
            search_space=self.search_space,
            n_iters=self.tuning_iters,
        )
        self.lgb_params = results.best_params_
        logger.info(f"best params {self.lgb_params}")

    def _save_model(self):
        """
        Saves sklearn pipeline as pkl and pmml files, also saves training file

        Args:
            pipeline (lightgbm pipeline) model to be saved
            output_dir (string): path to save model outputs
            train (df): dataset to save
        """
        pmml_path = f"{self.output_dir_path}/model-pmml.pmml"
        pkl_path = f"{self.output_dir_path}/model-bin.pkl"
        pickle.dump(self.pipeline, open(pkl_path, "wb"))
        # sklearn2pmml(self.pipeline, pmml_path)

    def _generate_shap_plots(self):
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        classifier.fit(self.X_train_encoded, self.y_train)
        X_shap = pd.DataFrame(data=self.X_train_encoded,
                              columns=self.var_names)
        self.feature_importance = utils.create_shap_plots(
            classifier,
            X_shap,
            output_dir=self.output_dir_path,
            N=self.shap_plot_num,
            frac=self.shap_frac,
        )

    def auto_build(self):
        """
        Populates the output_dir_path with model artifacts and evaluation charts
        """
        self._gen_model_dir()

        self._process_csv()

        self._prepare_X_y()

        if self.eda_flag:
            logger.info("EDA")
            utils.dataset_eda(data=self.X_train,
                              output_dir=self.output_dir_path)

        self._create_categorical_transformer()

        if self.tune_flag:
            self._tune()

        self._generate_shap_plots()

        logger.info("creating pipeline")
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        self.pipeline = PMMLPipeline([("mapper", self.mapper),
                                      ("classifier", classifier)])

        self.pipeline.fit(self.X_train, self.y_train)

        logger.info("Assessing model")

        y_pred = self.pipeline.predict_proba(self.X_test)[:, 1]
        y_bm = np.repeat(self.y_train.mean(), self.y_test.shape[0])  # constant base-rate benchmark
        utils.evaluate_model(self.y_test, y_pred, y_bm, self.output_dir_path,
                             "Model")

        logger.info("suggeting features to remove")
        self.cols_to_remove = utils.find_features_to_remove(
            importance=self.feature_importance,
            X=self.X_train,
            importance_cutoff=self.importance_cutoff,
            corr_threshold=self.corr_cutoff,
        )
        logger.info(f"candidates to remove - {self.cols_to_remove}")

        logger.info(f"saving model \n{self.output_dir_path}")

        self._save_model()
        test_input = dict(self.X_test.iloc[0])
        test_score = self.pipeline.predict_proba(self.X_test.head(1))
        logger.info(
            f"test-case model inputs \n{ test_input } \n model score \n {test_score}"
        )

        logger.info("done!")
Example No. 11
    build_iris(VotingClassifier([("dt",
                                  DecisionTreeClassifier(random_state=13)),
                                 ("nb", GaussianNB()),
                                 ("lr", LogisticRegression())]),
               "VotingEnsembleIris",
               with_proba=False)
    build_iris(OptimalXGBClassifier(objective="multi:softprob", ntree_limit=7),
               "XGBIris",
               ntree_limit=7)

if "Iris" in datasets:
    classifier = RuleSetClassifier(
        [("X['Petal.Length'] >= 2.45 and X['Petal.Width'] < 1.75",
          "versicolor"), ("X['Petal.Length'] >= 2.45", "virginica")],
        default_score="setosa")
    pipeline = PMMLPipeline([("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    pipeline.verify(iris_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, "RuleSetIris")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    store_csv(species, "RuleSetIris")

#
# Text classification
#

sentiment_X, sentiment_y = load_sentiment("Sentiment")


def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    pipeline = PMMLPipeline([
Example No. 12
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), OneHotEncoder()])
     for f in categorical_features] +
    [(f, [CategoricalDomain(),
          CountVectorizer(tokenizer=Splitter())]) for f in text_features])

pipeline = PMMLPipeline([("mapper", mapper), ("model", GaussianNB())])
pipeline.fit(data, data["drv"])

sklearn2pmml(pipeline, "test/support/python/naive_bayes.pmml")

print(pipeline.predict(data[:10]))
# raw_data = raw_data.sample(frac=0.03)

# Convert non-numeric data into numeric values
# print("Transforming data...")
raw_data[last_column_index], attacks = pd.factorize(
    raw_data[last_column_index], sort=True)
# Slice the raw data into features and labels: columns 1-41 are the features, column 42 is the label
features = raw_data.iloc[:, :raw_data.shape[1] - 1]  # iloc slicing in pandas is purely position-based
labels = raw_data.iloc[:, raw_data.shape[1] - 1:]
# Standardize the data
# features = preprocessing.scale(features)
# features = pd.DataFrame(features)
# Flatten the multi-dimensional labels into a one-dimensional array
labels = labels.values.ravel()

# Split the data into training and test sets
df = pd.DataFrame(features)
X_train, X_test, y_train, y_test = train_test_split(df,
                                                    labels,
                                                    train_size=0.8,
                                                    test_size=0.2,
                                                    stratify=labels)

pipeline = PMMLPipeline([("classifier",
                          DecisionTreeClassifier(criterion='entropy',
                                                 max_depth=12,
                                                 min_samples_leaf=1,
                                                 splitter="best"))])
pipeline.fit(X_train, y_train)
sklearn2pmml(pipeline, "data/pmml/DecisionTreeIris.pmml", with_repr=True)
feature_preprocessing = compose.ColumnTransformer(
    [('cat_feature_pipeline', cat_feature_pipeline, cat_features_list),
     ('num_feature_pipeline', num_feature_pipeline, num_features_list)],
    n_jobs=10)

features_pipeline = pipeline.FeatureUnion(
    [('pca_selector', decomposition.PCA(n_components=0.90)),
     ('et_selector',
      feature_selection.SelectFromModel(ensemble.ExtraTreesClassifier()))],
    n_jobs=20)

classifier = naive_bayes.GaussianNB()
# build the complete pipeline with feature selection and ML algorithms
complete_pipeline = PMMLPipeline([('preprocess', feature_preprocessing),
                                  ('zv_filter',
                                   feature_selection.VarianceThreshold()),
                                  ('features', features_pipeline),
                                  ('tree', classifier)])

pipeline_grid = {}
grid_estimator = model_selection.GridSearchCV(complete_pipeline,
                                              pipeline_grid,
                                              scoring="accuracy",
                                              cv=5,
                                              verbose=10,
                                              n_jobs=20)
grid_estimator.fit(X_train, y_train)
print(grid_estimator.best_estimator_)
print(grid_estimator.best_params_)
print(grid_estimator.best_score_)
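
The fragment stops at inspecting the search results; exporting the tuned pipeline would presumably mirror the other examples here. A hedged continuation (the filename is hypothetical, not in the original):

from sklearn2pmml import sklearn2pmml

# best_estimator_ is a fitted clone of the PMMLPipeline defined above.
sklearn2pmml(grid_estimator.best_estimator_, "GridSearchPipeline.pmml", with_repr=True)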
Example No. 15
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

import pandas as pd
import numpy as np

data = load_iris()
feature_cols = data['feature_names']
df = pd.DataFrame(data=np.c_[data['data'], data['target']],
                  columns=data['feature_names'] + ['target'])

X, y = df[feature_cols], df["target"]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.12,
                                                    random_state=42)

print(X_test.head())
print(y_test.head())

pipeline = PMMLPipeline([("classifier", GradientBoostingClassifier())])
pipeline.fit(X_train, y_train)

sklearn2pmml(
    pipeline,
    "/Users/tcfartunc/Dev/workspaces/python/python_model_server/pmmls/GBIris.pmml",
    with_repr=True)
Example No. 16
        FILE_PREFIX, train_transaction))

    identity_test_df = pd.read_csv("{}/{}".format(FILE_PREFIX, test_identity))
    X_final = pd.read_csv("{}/{}".format(FILE_PREFIX, test_transaction))
    print("===============finish file loading=================")
    Y_train = transaction_train_df_raw['isFraud']
    X_train = transaction_train_df_raw.drop('isFraud', axis=1)  # 506691

    preprocessor = PreProcessor(transaction_train_df_raw, identity_train_df)
    X_train_after_processing = preprocessor.preprocess()
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    #my_pipeline = Pipeline(steps=[('preprocessor', preprocessor_pipeline), ('model', model)])

    # Preprocessing of training data, fit model
    # my_pipeline.fit(X_train, Y_train)
    #my_pipeline.fit(X_train, Y_train)

    from sklearn2pmml.pipeline import PMMLPipeline

    pipeline = PMMLPipeline([
        #("preprocessing", dataFrameMapper),
        ("classifier", model)
    ])
    pipeline.fit(X_train_after_processing, Y_train)

    from sklearn2pmml import sklearn2pmml

    sklearn2pmml(pipeline, "model.pmml", with_repr=True)

    # Preprocessing of validation data, get predictions
    #preds = my_pipeline.predict(X_test)
Example No. 17
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

if __name__ == '__main__':
    fp = "iris.pmml"
    iris = load_iris()  # load the famous data set
    X = iris.data  # features with four variables
    y = iris.target  # training target
    pipeline = PMMLPipeline([("classifier", tree.DecisionTreeClassifier())])
    pipeline.fit(X, y)  # fit the model
    sklearn2pmml(pipeline, fp, with_repr=True)  # create output file

    # features and targets
    print(f'features: {X[0, :]}, target: {y[0]}')
    print(f'features: {X[1, :]}, target: {y[1]}')
Example No. 18
def build_iris(classifier, name, **pmml_options):
	cont_columns = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]
	cont_mappings = [([cont_column], ContinuousDomain()) for cont_column in cont_columns]
	mapper = DataFrameMapper(cont_mappings)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(iris_X, iris_y)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(iris_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(iris_X.sample(n = 3, random_state = 13))
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
	store_csv(pandas.concat((species, species_proba), axis = 1), name)
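
A hedged usage sketch for the build_iris helper (assumes iris_X, iris_y, store_pmml, store_csv and the classifier imports from the surrounding script):

build_iris(DecisionTreeClassifier(random_state=13), "DecisionTreeIris")
build_iris(XGBClassifier(objective="multi:softprob"), "XGBIris")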
Example No. 19
def build_audit_na(classifier, name, with_proba=True, **kwargs):
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0, "MALE": 1}
    mapper = DataFrameMapper([(["Age"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"),
              name="flag_missing(Age, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Hours"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"),
              name="flag_missing(Hours, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Income"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_missing_values",
                         low_value=5000,
                         high_value=200000,
                         with_data=False),
        Imputer()
    ])] + [(["Employment"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(employment_mapping, "OTHER"),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ])] + [([column], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(missing_values=None),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ]) for column in ["Education", "Marital", "Occupation"]] + [(["Gender"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(gender_mapping, None)
    ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")
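
The flag_missing expressions above swap NaN for a -999 sentinel so the downstream Imputer can treat it as the missing value; a tiny standalone illustration of what the expression computes (not part of the original):

import numpy
import pandas

X = numpy.array([[25.0], [numpy.nan], [40.0]])
flagged = numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)
print(flagged)  # [  25. -999.   40.]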
Example No. 20
def build_auto(regressor, name, **pmml_options):
	cat_columns = ["cylinders", "model_year", "origin"]
	cont_columns = ["displacement", "horsepower", "weight", "acceleration"]
	if isinstance(regressor, LGBMRegressor):
		cat_mappings = [([cat_column], [cat_domain(name), label_encoder(name)]) for cat_column in cat_columns]
	else:
		cat_mappings = [([cat_column], [cat_domain(name), label_binarizer(name)]) for cat_column in cat_columns]
	cont_mappings = [([cont_column], [cont_domain(name)]) for cont_column in cont_columns]
	mapper = DataFrameMapper(cat_mappings + cont_mappings)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	if isinstance(regressor, LGBMRegressor):
		pipeline.fit(auto_X, auto_y, regressor__categorical_feature = [0, 1, 2])
	elif isinstance(regressor, IsolationForest):
		pipeline.fit(auto_X)
	else:
		pipeline.fit(auto_X, auto_y)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(n = 3, random_state = 13))
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	if isinstance(regressor, IsolationForest):
		decision_function = DataFrame(pipeline.decision_function(auto_X), columns = ["decisionFunction"])
		outlier = DataFrame(pipeline.predict(auto_X), columns = ["outlier"])
		outlier['outlier'] = outlier['outlier'].apply(lambda x: str(bool(x == -1)).lower())
		store_csv(pandas.concat((decision_function, outlier), axis = 1), name)
	else:
		mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
		store_csv(mpg, name)
Example No. 21
from sklearn import model_selection
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

iris = load_iris()
data = iris.data
target = iris.target

# os.environ["PATH"] += os.pathsep + 'C:/Program Files/Java/jdk1.8.0_171/bin'
# X=[[1,2,3,1],[2,4,1,5],[7,8,3,6],[4,8,4,7],[2,5,6,9]]
# y=[0,1,0,2,1]

# pipeline = PMMLPipeline([("classifier", tree.DecisionTreeClassifier(random_state=9))]);
# pipeline.fit(data,target)
# sklearn2pmml(pipeline, "tree_result.pmml")

# url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
# names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# dataframe = pd.read_csv(url, names=names)
# print(dataframe)
# array = dataframe.values
# X = array[:,0:8]
# Y = array[:,8]
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    data, target, test_size=test_size, random_state=seed)
# Fit the model on training set
# model = LogisticRegression()
# model.fit(X_train, Y_train)

pipeline = PMMLPipeline([("classifier", LogisticRegression())])
pipeline.fit(X_train, Y_train)
sklearn2pmml(pipeline, "logit_result.pmml")
Example No. 22
def build_audit(classifier, name, **pmml_options):
	if isinstance(classifier, LGBMClassifier):
		cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender"]
		cont_columns = ["Income", "Hours"]
	else:
		cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender"]
		cont_columns = ["Age", "Income", "Hours"]
	if isinstance(classifier, LGBMClassifier):
		cat_mappings = [([cat_column], [cat_domain(name), label_encoder(name)]) for cat_column in cat_columns]
	else:
		cat_mappings = [([cat_column], [cat_domain(name), label_binarizer(name)]) for cat_column in cat_columns]
	cont_mappings = [([cont_column], cont_domain(name)) for cont_column in cont_columns]
	mapper = DataFrameMapper(cat_mappings + cont_mappings)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	if isinstance(classifier, LGBMClassifier):
		pipeline.fit(audit_X, audit_y, classifier__categorical_feature = [0, 1, 2, 3, 4, 5])
	elif isinstance(classifier, XGBClassifier):
		if name == "XGBoostAuditNA":
			audit_X["Age"] = audit_X["Age"].astype(float)
		pipeline.fit(audit_X, audit_y)
	else:
		pipeline.fit(audit_X, audit_y)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(n = 3, random_state = 13))
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
Example No. 23
mapper = DataFrameMapper(
    [(cat_column,
      [CategoricalDomain(invalid_value_treatment="as_is"),
       LabelBinarizer()]) for cat_column in cat_columns] +
    [([cont_column],
      [ContinuousDomain(invalid_value_treatment="as_is"),
       StandardScaler()]) for cont_column in cont_columns])

selector = SelectKBest()

classifier = LogisticRegression(multi_class="ovr",
                                penalty="elasticnet",
                                solver="saga",
                                max_iter=1000)

pipeline = PMMLPipeline([("mapper", mapper), ("selector", selector),
                         ("classifier", classifier)])

param_grid = {
    "selector__k": [10, 20, 30],
    "classifier__l1_ratio": [0.7, 0.8, 0.9]
}

searcher = GridSearchCV(estimator=pipeline, param_grid=param_grid)
searcher.fit(df_X, df_y)

print(searcher.best_params_)

best_pipeline = searcher.best_estimator_
best_pipeline.verify(df_X.sample(n=5))

sklearn2pmml(best_pipeline, "GridSearchAudit.pmml")
Example No. 24
print('accuracy:', accuracy)

matrix = sklearn.metrics.confusion_matrix(pred_data['y_test'], pred_data['y_predict'])
plt.matshow(matrix)
plt.colorbar()
plt.xlabel('predicted class')
plt.ylabel('true class')
plt.show()


"""
    方法二、用网格搜索方法寻找最优模型参数
"""
svc2 = sklearn.svm.SVC()
# 制定待训练的每个参数的若干备选值
param = {"C": [0.2, 0.4, 0.8, 1, 1.2, 1.6, 2],
         "kernel": ['rbf', 'linear'],
         "gamma": ['auto', 0.01, 0.05, 0.2, 0.4, 0.5, 1, 1.6, 2]}
gscv = sklearn.model_selection.GridSearchCV(svc2, param_grid=param, cv=3)
gscv = gscv.fit(X_train, y_train)
print("best score: {}".format(gscv.best_score_))
print("best params: {}".format(gscv.best_params_))


"""
    4.把模型导出至pmml文件,可被用于后续java系统,这里要提前装jdk貌似,暂时运行不了
"""
pipeline = PMMLPipeline([("classifier", svc1)])
pipeline.fit(X_train, y_train)
sklearn2pmml(pipeline, ".\\demo.pmml", with_repr=True)

if __name__ == '__main__':
    samples = get_jewellery_data()
    X = samples.drop(['id'], axis=1)
    Y = samples["id"]
    print("data done!")

    pipeline = PMMLPipeline([
        ('mapper',
         DataFrameMapper([('name',
                           TfidfVectorizer(norm=None,
                                           analyzer="word",
                                           max_features=200,
                                           tokenizer=Splitter())),
                          ('description',
                           TfidfVectorizer(norm=None,
                                           analyzer="word",
                                           max_features=600,
                                           tokenizer=Splitter()))])),
        ('model', SVC(max_iter=10000)),  # train on TF-IDF vectors with an SVM classifier
    ])
    print("model set done!")

    pipeline.fit(X, Y)
    print("model fit done!")

    sklearn2pmml(pipeline,
                 "../model/pmml_for_jewellery_second.pmml",
                 with_repr=True)
Example No. 26
import pandas
from xgboost import XGBClassifier, XGBRegressor
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.preprocessing import LabelEncoder

iris_df = pandas.read_csv("iris.csv")
iris_df['species'] = LabelEncoder().fit_transform(iris_df['species'].values)
#iris_df.columns
#['sepal_length', 'sepal_width', 'petal_length', 'petal_width','species']
clf = XGBClassifier(
    silent=0,  # set to 1 to silence runtime output; best kept at 0 so boosting messages are printed
    #nthread=4,  # number of CPU threads; defaults to the maximum available
    learning_rate=0.3,  # learning rate (shrinkage)
    min_child_weight=1,
    # Defaults to 1: the minimum sum of instance weights (hessian) required in a leaf.
    # For imbalanced 0-1 classification, if h is around 0.01, min_child_weight = 1 means
    # a leaf must cover roughly 100 samples. This parameter strongly affects the result:
    # it bounds the sum of second derivatives in a leaf, and smaller values overfit more easily.
    max_depth=6,  # tree depth; the larger, the more prone to overfitting
    gamma=0,  # minimum loss reduction required for a further split; larger is more conservative (typically 0.1 or 0.2)
    subsample=1,  # subsample ratio of the training instances
    max_delta_step=0,  # maximum delta step allowed for each tree's weight estimation
    colsample_bytree=1,  # subsample ratio of columns when constructing each tree
    reg_lambda=1,  # L2 regularization on weights; the larger, the less prone to overfitting
    objective='multi:softmax',  # multi-class objective; specifies the learning task and target
    n_estimators=100,  # number of trees
    seed=1000)
pipeline = PMMLPipeline([("classifier", clf)])
pipeline.fit(iris_df[iris_df.columns.difference(["species"])],
             iris_df["species"])
sklearn2pmml(pipeline, "xgboost.pmml", with_repr=True)
Example No. 27
def build_audit_na(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		[(["Age"], MissingIndicator())] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
Example No. 28
import pandas

iris_df = pandas.read_csv("Iris.csv")

from sklearn.tree import DecisionTreeClassifier
from sklearn2pmml.pipeline import PMMLPipeline

pipeline = PMMLPipeline([("classifier", DecisionTreeClassifier())])
pipeline.fit(iris_df[iris_df.columns.difference(["Species"])],
             iris_df["Species"])

from sklearn2pmml import sklearn2pmml

sklearn2pmml(pipeline, "DecisionTreeIris.pmml", with_repr=True)
Example No. 29
def build_auto(regressor, name, fit_params = {}, predict_params = {}, **pmml_options):
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		(["cylinders"], Alias(ExpressionTransformer("X[0] % 2.0 > 0.0"), name = "odd(cylinders)", prefit = True)),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["model_year", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), ConcatTransformer("/"), LabelBinarizer(), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
Example No. 30
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn.datasets import load_iris
import os

iris = load_iris()
data = iris.data
target = iris.target

# os.environ["PATH"] += os.pathsep + 'C:/Program Files/Java/jdk1.8.0_171/bin'
# X=[[1,2,3,1],[2,4,1,5],[7,8,3,6],[4,8,4,7],[2,5,6,9]]
# y=[0,1,0,2,1]

pipeline = PMMLPipeline([("classifier",
                          tree.DecisionTreeClassifier(random_state=9))])
pipeline.fit(data, target)
sklearn2pmml(pipeline, "tree_result.pmml")