示例#1
0
def xgboost_to_pmml(data_X, data_y, par_file, save_model_as):
    """Save XGBoost model to PMML file.

    Parameters
    ----------
    data_X : pandas.DataFrame
        Variables of train data.
    data_y : pandas.DataFrame
        Labels of train data.
    par_file : str
        File path of model's parameters (JSON object of XGBClassifier kwargs).
    save_model_as : str
        File path of PMML.

    Returns
    -------
    None
        Generate PMML file locally as `save_model_as` given.

    Examples
    --------
    >>> xgboost_to_pmml(data_x, data_y, "par.json", "model.pmml")
    """
    # Build an XGBClassifier from the JSON parameter file.
    with open(par_file, "r") as f:
        par = json.load(f)
    xgb_now = XGBClassifier(**par)
    # Wrap in a PMMLPipeline so sklearn2pmml can export it.
    pipeline = PMMLPipeline([("classifier", xgb_now)])
    # Fit Model
    pipeline.fit(data_X, data_y)
    # Save Model (with_repr embeds the pipeline repr into the PMML).
    sklearn2pmml(pipeline, save_model_as, with_repr=True)
示例#2
0
def build_audit(classifier, name, with_proba=True):
    """Fit the given classifier on the mapped audit data, persist the
    pipeline, and store predictions (optionally with probabilities) as CSV."""
    employment_steps = [
        LabelBinarizer(),
        SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state=13)),
                        threshold="1.25 * mean")
    ]
    education_steps = [
        LabelBinarizer(),
        SelectorProxy(SelectFromModel(
            EstimatorProxy(RandomForestClassifier(random_state=13,
                                                  n_estimators=3)),
            threshold="median"))
    ]
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", employment_steps),
        ("Education", education_steps),
        ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(), SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()),
        ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()),
        ("Hours", ContinuousDomain()),
    ])
    audit_pipeline = PMMLPipeline([("mapper", mapper),
                                   ("classifier", classifier)])
    audit_pipeline.fit(audit_X, audit_y)
    store_pkl(audit_pipeline, name + ".pkl")
    result = DataFrame(audit_pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        proba = DataFrame(audit_pipeline.predict_proba(audit_X),
                          columns=["probability_0", "probability_1"])
        result = pandas.concat((result, proba), axis=1)
    store_csv(result, name + ".csv")
示例#3
0
def train_and_save_model(data, model_path):
    """Fit a linear regression on columns ``x``/``y`` of *data* and export
    the fitted pipeline as a PMML file at *model_path* via sklearn2pmml."""
    reg_pipeline = PMMLPipeline([("regressor", linear_model.LinearRegression())])
    reg_pipeline.fit(data[["x"]], data["y"])
    sklearn2pmml(reg_pipeline, model_path)
示例#4
0
File: train.py    Project: wshzd/ML
def getFirstContent(dataUrl, modelUrl, modelName):
    """Train a bag-of-words / TF-IDF text classifier and persist it.

    Parameters
    ----------
    dataUrl : str
        Directory layout readable by ``sklearn.datasets.load_files``.
    modelUrl : str
        Output directory prefix (concatenated directly with ``modelName``,
        so it should end with a path separator).
    modelName : str
        File name of the persisted model.

    Returns
    -------
    str
        ``"success"`` if the model file exists afterwards, else ``"fail"``.
    """
    training_data = load_files(dataUrl, encoding="utf-8")

    # Feature extraction, step 1: raw term counts.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(training_data.data)

    # Feature extraction, step 2: TF-IDF weighting of the counts.
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Classification (LogisticRegression, despite the mnb_ prefix).
    mnb_pipeline = PMMLPipeline([("classifier", LogisticRegression())])
    mnb_pipeline.fit(X_train_tfidf, training_data.target)

    # BUG FIX: the original used C-style "//" comment markers here, which are
    # a SyntaxError in Python (// is the floor-division operator).
    # Persist as pkl format.
    joblib.dump(mnb_pipeline, modelUrl + modelName)
    # Persist as PMML format.
    # NOTE(review): both dumps target the same path, so the PMML file
    # overwrites the pkl dump — confirm this is intended.
    sklearn2pmml(mnb_pipeline, modelUrl + modelName, with_repr=True)

    if os.path.exists(modelUrl + modelName):
        return "success"
    else:
        return "fail"
示例#5
0
def build_sentiment(classifier, name, with_proba=True):
    """Fit a TF-IDF -> chi2 percentile selection -> classifier pipeline on
    the sentiment data; store the model and its predictions as CSV."""
    # RandomForest gets float32 features, every other classifier float64.
    tfidf_dtype = (numpy.float32
                   if isinstance(classifier, RandomForestClassifier)
                   else numpy.float64)
    vectorizer = TfidfVectorizer(analyzer="word",
                                 preprocessor=None,
                                 strip_accents=None,
                                 lowercase=True,
                                 token_pattern=None,
                                 tokenizer=Splitter(),
                                 stop_words="english",
                                 ngram_range=(1, 2),
                                 norm=None,
                                 dtype=tfidf_dtype)
    pipeline = PMMLPipeline([
        ("tf-idf", vectorizer),
        ("selector", SelectorProxy(SelectPercentile(chi2, percentile=10))),
        ("classifier", classifier),
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    store_pkl(pipeline, name + ".pkl")
    result = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(sentiment_X),
                          columns=["probability(0)", "probability(1)"])
        result = pandas.concat((result, proba), axis=1)
    store_csv(result, name + ".csv")
示例#6
0
def build_iris(classifier, name, with_proba=True):
    """Fit a (raw + log10) feature union -> RobustScaler -> 3-component PCA
    -> classifier pipeline on the iris data; store model and predictions."""
    normal_mapper = DataFrameMapper([
        (iris_X.columns.values, ContinuousDomain()),
    ])
    log_mapper = DataFrameMapper([(iris_X.columns.values,
                                   FunctionTransformer(numpy.log10))])
    union = FeatureUnion([("normal_scale", normal_mapper),
                          ("log_scale", log_mapper)])
    pipeline = PMMLPipeline([
        ("union", union),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier),
    ])
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")
    result = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(iris_X),
                          columns=["probability(setosa)",
                                   "probability(versicolor)",
                                   "probability(virginica)"])
        result = pandas.concat((result, proba), axis=1)
    store_csv(result, name + ".csv")
示例#7
0
def build_audit(classifier, name, with_proba = True, **kwargs):
	"""Fit a feature union of continuous columns and polynomially-expanded
	categorical columns with the given classifier on the audit data; store
	the model and its predictions (optionally with probabilities)."""
	continuous_mapper = DataFrameMapper([(col, ContinuousDomain()) for col in ["Age", "Income", "Hours"]])
	categorical_mapper = DataFrameMapper([
		("Employment", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state = 13))))]),
		("Education", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state = 13, n_estimators = 3)), threshold = "1.25 * mean"))]),
		("Marital", [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectKBest(k = 3))]),
		("Gender", [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
		("Deductions", [CategoricalDomain(), LabelEncoder()]),
	])
	categorical_branch = Pipeline([
		("mapper", categorical_mapper),
		("polynomial", PolynomialFeatures())
	])
	union = FeatureUnion([
		("continuous", continuous_mapper),
		("categorical", categorical_branch)
	])
	pipeline = PMMLPipeline([
		("union", union),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y)
	# Post-fit customization hook for the classifier.
	customize(classifier, **kwargs)
	store_pkl(pipeline, name + ".pkl")
	result = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if(with_proba == True):
		proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		result = pandas.concat((result, proba), axis = 1)
	store_csv(result, name + ".csv")
示例#8
0
def build_housing(regressor, name, with_kneighbors=False):
    """Fit polynomial/scaled/percentile-selected housing features with the
    given regressor; optionally also store nearest-neighbor ids per row."""
    domain_mapper = DataFrameMapper([(housing_X.columns.values,
                                      ContinuousDomain())])
    transformer = Pipeline([
        ("polynomial", PolynomialFeatures(degree=2,
                                          interaction_only=True,
                                          include_bias=False)),
        ("scaler", StandardScaler()),
        ("selector", SelectorProxy(SelectPercentile(score_func=f_regression,
                                                    percentile=35))),
    ])
    pipeline = PMMLPipeline([
        ("mapper", domain_mapper),
        ("transformer-pipeline", transformer),
        ("regressor", regressor),
    ])
    pipeline.fit(housing_X, housing_y)
    store_pkl(pipeline, name + ".pkl")
    medv = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    if with_kneighbors == True:
        Xt = pipeline_transform(pipeline, housing_X)
        kneighbors = regressor.kneighbors(Xt)
        # kneighbors[1] holds 0-based row indices; shift them to 1-based ids.
        id_columns = ["neighbor(" + str(i + 1) + ")"
                      for i in range(regressor.n_neighbors)]
        medv_ids = DataFrame(kneighbors[1] + 1, columns=id_columns)
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")
def save_as_PMML(data, modelPath):
    """Fit a linear regression on columns ``x``/``y`` of *data* and export
    it as a PMML file.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain columns ``x`` (feature) and ``y`` (target).
    modelPath : str
        Destination path of the PMML file.

    BUG FIX: the original ignored ``modelPath`` and always wrote to the
    hard-coded file name "linear.pmml".
    """
    model = PMMLPipeline([("regressor", linear_model.LinearRegression())])
    model.fit(data[["x"]], data["y"])
    sklearn2pmml(model, modelPath, with_repr=True)
示例#10
0
def train(data_conf, model_conf, **kwargs):
    """Python train method called by AOA framework

    Parameters:
    data_conf (dict): The dataset metadata; 'location' must point to a CSV
        containing a 'species' target column.
    model_conf (dict): The model configuration to use

    Returns:
    None:No return

    """

    # load data & engineer
    iris_df = pd.read_csv(data_conf['location'])
    # Local renamed from "train" so it no longer shadows this function.
    train_df, _ = train_test_split(iris_df, test_size=0.5, random_state=42)
    # BUG FIX: the positional axis argument to DataFrame.drop was deprecated
    # and removed in pandas 2.0 — pass axis=1 explicitly.
    X = train_df.drop("species", axis=1)
    y = train_df['species']

    print("Starting training...")
    # fit model to training data
    classifier = PMMLPipeline([('classifier', RandomForestClassifier())])
    classifier.fit(X, y.values.ravel())
    print("Finished training")

    # export model artefacts to models/ folder (idempotent, race-free)
    os.makedirs('models', exist_ok=True)
    sklearn2pmml(classifier, "models/model.pmml")
    print("Saved trained model")
示例#11
0
def pmml(x, Y):
    """Fit a logistic-regression PMMLPipeline on (x, Y) and write it to the
    file "LogisticRegression.pmml"."""
    from sklearn2pmml import PMMLPipeline, sklearn2pmml

    lr_pipeline = PMMLPipeline([("classifier", LogisticRegression())])

    # Train the model, then export it as PMML.
    lr_pipeline.fit(x, Y)
    sklearn2pmml(lr_pipeline, "LogisticRegression.pmml")
示例#12
0
def build_audit_dict(classifier, name, with_proba=True):
    """Fit a DictVectorizer + classifier pipeline on the dict-encoded audit
    data; store the model and its predictions (optionally with probas)."""
    steps = [("dict-transformer", DictVectorizer()),
             ("classifier", classifier)]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(audit_dict_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    result = DataFrame(pipeline.predict(audit_dict_X), columns=["Adjusted"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(audit_dict_X),
                          columns=["probability(0)", "probability(1)"])
        result = pandas.concat((result, proba), axis=1)
    store_csv(result, name + ".csv")
示例#13
0
def build_iforest_housing_anomaly(iforest, name):
    """Fit an isolation-forest anomaly detector on the housing data; store
    the decision-function values and "true"/"false" outlier flags as CSV."""
    domain_mapper = DataFrameMapper([(housing_X.columns.values,
                                      ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", domain_mapper),
                             ("estimator", iforest)])
    pipeline.fit(housing_X)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    # predict() == -1 marks anomalies; render the booleans as text.
    outlier = DataFrame(pipeline.predict(housing_X) == -1,
                        columns=["outlier"])
    outlier = outlier.replace(True, "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1),
              name + ".csv")
示例#14
0
def build_auto_na(regressor, name):
	"""Fit the given regressor on the auto data with missing values:
	continuous columns are imputed, categorical columns imputed (sentinel -1)
	and label-binarized; store the model and its mpg predictions."""
	numeric_features = [([col], [ContinuousDomain(missing_values = None), Imputer()]) for col in ["acceleration", "displacement", "horsepower", "weight"]]
	categorical_features = [([col], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for col in ["cylinders", "model_year", "origin"]]
	mapper = DataFrameMapper(numeric_features + categorical_features)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
	pipeline.fit(auto_na_X, auto_na_y)
	store_pkl(pipeline, name + ".pkl")
	predictions = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	store_csv(predictions, name + ".csv")
示例#15
0
 def test_fit(self):
     """Fitting a PMMLPipeline must record the input column names in
     `active_fields` and the target name in `target_fields`; refitting with
     renamed columns must refresh them."""
     pipeline = PMMLPipeline([("estimator", DummyRegressor())])
     # Fields are only attached by fit(), not at construction time.
     self.assertFalse(hasattr(pipeline, "active_fields"))
     self.assertFalse(hasattr(pipeline, "target_fields"))
     X = DataFrame([[1, 0], [2, 0], [3, 0]], columns=["X1", "X2"])
     y = Series([0.5, 1.0, 1.5], name="y")
     pipeline.fit(X, y)
     self.assertEqual(["X1", "X2"], pipeline.active_fields.tolist())
     self.assertEqual("y", pipeline.target_fields.tolist())
     # Renaming the columns and refitting overwrites the recorded fields.
     X.columns = ["x1", "x2"]
     pipeline.fit(X, y)
     self.assertEqual(["x1", "x2"], pipeline.active_fields.tolist())
     self.assertEqual("y", pipeline.target_fields.tolist())
示例#16
0
class FirstStep(object):
    """End-to-end iris demo: split the data, standard-scale the features,
    train a decision tree inside a PMMLPipeline, and persist/reload the
    fitted pipeline with joblib."""

    def __init__(self):
        self.__iris = load_iris()
        self.__X = pd.DataFrame(self.__iris.data,
                                columns=self.__iris.feature_names)
        self.__y = pd.DataFrame(self.__iris.target, columns=["Species"])
        self.__train = None
        self.__train_label = None
        self.__test = None
        self.__test_one_sample = None
        self.__test_label = None
        self.__mapper = None
        self.__estimator = None
        self.__pipeline = None

    def train_test_split_step(self):
        self.__train, self.__test, self.__train_label, self.__test_label = (
            train_test_split(self.__X, self.__y, test_size=0.2))
        self.__train = self.__train.reset_index(drop=True)
        self.__train_label = self.__train_label.reset_index(drop=True)
        self.__test = self.__test.reset_index(drop=True)
        # BUG FIX: the original reassigned the test labels from the TRAIN
        # features (self.__train.reset_index(...)); reset the index of the
        # test labels themselves instead.
        self.__test_label = self.__test_label.reset_index(drop=True)

    def feature_engineering_step(self):
        # Standard-scale all four iris measurement columns.
        self.__mapper = (DataFrameMapper([([
            "sepal length (cm)", "sepal width (cm)", "petal length (cm)",
            "petal width (cm)"
        ], [StandardScaler()])]))

    def model_train_step(self):
        self.__estimator = DecisionTreeClassifier()

    def pipeline_step(self):
        self.__pipeline = PMMLPipeline([("mapper", self.__mapper),
                                        ("estimator", self.__estimator)])
        self.__pipeline.fit(self.__train, self.__train_label)

    def output_step(self):
        joblib.dump(self.__pipeline,
                    "C:\\Users\\Dell\\Desktop\\pipeline.pkl.z",
                    compress=3)

    def input_step(self):
        self.__pipeline = joblib.load(
            "C:\\Users\\Dell\\Desktop\\pipeline.pkl.z")
        self.__test_one_sample = self.__test[0:1]
        print(self.__pipeline.predict(self.__test))
        # Predict a single row as well.
        print(self.__pipeline.predict(self.__test_one_sample))
示例#17
0
def build_auto(regressor, name):
	"""Fit the given regressor on the auto data (categorical cylinders/origin,
	imputed + scaled continuous columns, model_year binarized at 77) and
	store the model and its mpg predictions."""
	mapper = DataFrameMapper([
		(["cylinders"], CategoricalDomain()),
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), Imputer(missing_values = "NaN"), StandardScaler()]),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["origin"], OneHotEncoder())
	])
	auto_pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
	auto_pipeline.fit(auto_X, auto_y)
	store_pkl(auto_pipeline, name + ".pkl")
	predictions = DataFrame(auto_pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(predictions, name + ".csv")
示例#18
0
class XgbModel(object):
    """Train/evaluate wrapper around an XGBClassifier inside a PMMLPipeline,
    with plotting, CSV evaluation dump, and joblib export helpers."""

    def __init__(self, train, train_label, test, test_label):
        # Train/test splits. test_label is treated as a numpy array later
        # (evaluate_output calls .reshape on it) — TODO confirm at call site.
        self.__train = train
        self.__train_label = train_label
        self.__test = test
        self.__test_label = test_label
        self.__bst = None               # fitted PMMLPipeline (set by train())
        self.__feat_imp = None          # feature-importance Series
        self.__test_preds = None        # positive-class probabilities
        self.__test_predictions = None  # hard class predictions
        self.__output = None            # stacked evaluation matrix

    def train(self):
        self.__bst = XGBClassifier(objective="binary:logistic")
        # Wrap the classifier so it can later be exported via sklearn2pmml.
        self.__bst = PMMLPipeline([("estimator", self.__bst)])
        # "estimator__eval_metric" routes the kwarg to the wrapped step.
        self.__bst.fit(self.__train,
                       self.__train_label,
                       estimator__eval_metric="auc")

    def predict(self):
        # Column 1 of predict_proba is the positive-class probability.
        self.__test_preds = self.__bst.predict_proba(self.__test)[:, 1]
        self.__test_predictions = self.__bst.predict(self.__test)

    def feature_importances(self):
        # NOTE(review): feature_importances_ is read from the pipeline, not
        # the underlying XGBClassifier — confirm PMMLPipeline forwards it.
        # The hard-coded labels suggest 4 stacked base-model features.
        self.__feat_imp = (pd.Series(
            self.__bst.feature_importances_,
            ["gbc", "rf", "ab", "lr"]).sort_values(ascending=False))
        self.__feat_imp.plot(kind="bar", title="Feature Importances")
        plt.ylabel("Feature Importance Score")
        plt.show()

    def evaluate(self):
        # AUC uses the probabilities; accuracy uses the hard predictions.
        print("auc : %.4f" %
              roc_auc_score(self.__test_label, self.__test_preds))
        print("accuracy score : %.4f" %
              accuracy_score(self.__test_label, self.__test_predictions))

    def evaluate_output(self):
        # Side-by-side dump: features | true label | predicted probability.
        self.__output = np.hstack(
            (self.__test, self.__test_label.reshape(
                (-1, 1)), self.__test_preds.reshape((-1, 1))))
        pd.DataFrame(
            self.__output).to_csv("C:\\Users\\Dell\\Desktop\\output.csv")

    def xgbmodel_output(self):
        joblib.dump(self.__bst,
                    "C:\\Users\\Dell\\Desktop\\bstML.pkl.z",
                    compress=True)
示例#19
0
def build_versicolor(classifier, name, with_proba=True):
    """Fit robust-scaled, cubic-polynomial versicolor features with the given
    classifier and store the model plus its predictions."""
    mapper = DataFrameMapper([
        (versicolor_columns[:-1], [ContinuousDomain(), RobustScaler()])
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("transformer", PolynomialFeatures(degree=3)),
        ("selector", SelectKBest(k="all")),
        ("classifier", classifier),
    ])
    pipeline.fit(versicolor_X, versicolor_y)
    store_pkl(pipeline, name + ".pkl")
    result = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(versicolor_X),
                          columns=["probability_0", "probability_1"])
        result = pandas.concat((result, proba), axis=1)
    store_csv(result, name + ".csv")
示例#20
0
def build_audit_na(classifier, name, with_proba = True):
	"""Fit the given classifier on the audit data with missing values:
	continuous columns are imputed, categorical columns are imputed and
	label-binarized; store the model and its predictions."""
	numeric_features = [([col], [ContinuousDomain(missing_values = None), Imputer()]) for col in ["Age", "Income", "Hours"]]
	categorical_features = [([col], [CategoricalDomain(missing_values = None), CategoricalImputer(), PMMLLabelBinarizer()]) for col in ["Employment", "Education", "Marital", "Occupation", "Gender"]]
	mapper = DataFrameMapper(numeric_features + categorical_features)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_na_X, audit_na_y)
	store_pkl(pipeline, name + ".pkl")
	result = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	if(with_proba == True):
		proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
		result = pandas.concat((result, proba), axis = 1)
	store_csv(result, name + ".csv")
示例#21
0
def build_svm_housing_anomaly(svm, name):
    """Fit a max-abs-scaled SVM anomaly detector on the housing data; store
    the decision-function values and "true"/"false" outlier flags as CSV."""
    mapper = DataFrameMapper([(housing_columns[:-1], ContinuousDomain())])
    estimator = Pipeline([("first", MaxAbsScaler()), ("second", svm)])
    pipeline = PMMLPipeline([("mapper", mapper), ("estimator", estimator)])
    pipeline.fit(housing_X)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    # Non-positive predictions mark anomalies; render the booleans as text.
    outlier = DataFrame(pipeline.predict(housing_X) <= 0, columns=["outlier"])
    outlier = outlier.replace(True, "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1),
              name + ".csv")
示例#22
0
def build_wheat(kmeans, name, with_affinity=True):
    """Fit a min-max-scaled k-means clusterer on the wheat data; store the
    model, cluster assignments, and optionally per-cluster distances."""
    mapper = DataFrameMapper([(wheat_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", mapper),
                             ("scaler", MinMaxScaler()),
                             ("clusterer", kmeans)])
    pipeline.fit(wheat_X)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity == True:
        Xt = pipeline_transform(pipeline, wheat_X)
        # Distance of every sample to each of the three cluster centers.
        affinities = [kmeans_distance(kmeans, idx, Xt) for idx in (0, 1, 2)]
        cluster_affinity = DataFrame(
            numpy.transpose(affinities),
            columns=["affinity(0)", "affinity(1)", "affinity(2)"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
示例#23
0
    def model_wrapper_fit(self):
        """Wrap every configured classifier in its own PMMLPipeline, fit each
        on the training split, and collect them in __pmml_model_list."""
        self.__model_list.extend([
            self.__gradient_boosting_classifier,
            self.__random_forest_classifier, self.__logistic_regression,
            self.__k_neighbors_classifier, self.__extra_tree_classifier,
            self.__xgb_classifier
        ])

        # One PMMLPipeline per model, each fitted independently.
        for model in self.__model_list:
            temp = PMMLPipeline([("estimator", model)])
            temp.fit(self.__train, self.__train_label)
            self.__pmml_model_list.append(temp)

        # NOTE(review): these two calls refit the logistic regression from
        # scratch just to print coef_ and intercept_.
        print(
            self.__logistic_regression.fit(self.__train,
                                           self.__train_label).coef_)
        print(
            self.__logistic_regression.fit(self.__train,
                                           self.__train_label).intercept_)
示例#24
0
def build_iris(classifier, name, with_proba=True):
    """Fit RobustScaler -> 3-component PCA -> classifier on the iris data;
    store the model and its predictions (optionally with probabilities)."""
    mapper = DataFrameMapper([
        (iris_X.columns.values, ContinuousDomain()),
    ])
    steps = [
        ("mapper", mapper),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier),
    ]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")
    result = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba == True:
        proba_columns = ["probability_setosa", "probability_versicolor",
                         "probability_virginica"]
        proba = DataFrame(pipeline.predict_proba(iris_X),
                          columns=proba_columns)
        result = pandas.concat((result, proba), axis=1)
    store_csv(result, name + ".csv")
示例#25
0
	def test_fit_verify(self):
		"""Fitting must record active/target fields; verify() must capture a
		verification sample and reject reordered columns."""
		pipeline = PMMLPipeline([("estimator", DummyRegressor())])
		# Fields are only attached by fit(), not at construction time.
		self.assertFalse(hasattr(pipeline, "active_fields"))
		self.assertFalse(hasattr(pipeline, "target_fields"))
		X = DataFrame([[1, 0], [2, 0], [3, 0]], columns = ["X1", "X2"])
		y = Series([0.5, 1.0, 1.5], name = "y")
		pipeline.fit(X, y)
		self.assertEqual(["X1", "X2"], pipeline.active_fields.tolist())
		self.assertEqual("y", pipeline.target_fields.tolist())
		# Refitting with renamed columns refreshes the recorded fields.
		X.columns = ["x1", "x2"]
		pipeline.fit(X, y)
		self.assertEqual(["x1", "x2"], pipeline.active_fields.tolist())
		self.assertEqual("y", pipeline.target_fields.tolist())
		# verify() stores a 2-row verification sample...
		self.assertFalse(hasattr(pipeline, "verification"))
		pipeline.verify(X.sample(2))
		self.assertEqual(2, len(pipeline.verification.active_values))
		self.assertEqual(2, len(pipeline.verification.target_values))
		# ...and rejects input whose columns no longer match active_fields.
		X.columns = ["x2", "x1"]
		with self.assertRaises(ValueError):
			pipeline.verify(X.sample(2))
示例#26
0
def build_wheat(kmeans, name, with_affinity=True):
    """Fit a log10 + min-max-scaled k-means clusterer on the seven wheat
    measurement columns; store the model, cluster assignments, and
    optionally per-cluster distances."""
    wheat_columns = [
        "Area", "Perimeter", "Compactness", "Kernel.Length", "Kernel.Width",
        "Asymmetry", "Groove.Length"
    ]
    mapper = DataFrameMapper([(wheat_columns, ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", mapper),
                             ("transformer", FunctionTransformer(numpy.log10)),
                             ("scaler", MinMaxScaler()),
                             ("clusterer", kmeans)])
    pipeline.fit(wheat_X)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity == True:
        Xt = pipeline_transform(pipeline, wheat_X)
        # Distance of every sample to each of the three cluster centers.
        affinities = [kmeans_distance(kmeans, idx, Xt) for idx in (0, 1, 2)]
        cluster_affinity = DataFrame(
            numpy.transpose(affinities),
            columns=["affinity_0", "affinity_1", "affinity_2"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
示例#27
0
def build_audit_na(classifier, name, with_proba=True):
    """Fit the given classifier on the audit data with missing values,
    remapping Employment into Private/Public (default "Other") and Gender
    into 0/1; store the model and its predictions."""
    employment_mapping = {
        "Consultant": "Private",
        "PSFederal": "Public",
        "PSLocal": "Public",
        "PSState": "Public",
        "SelfEmp": "Private",
        "Private": "Private"
    }
    gender_mapping = {"Female": 0, "Male": 1}
    numeric_features = [
        ([col], [ContinuousDomain(missing_values=None), Imputer()])
        for col in ["Age", "Income", "Hours"]
    ]
    employment_feature = [("Employment", [
        CategoricalDomain(missing_values=None),
        CategoricalImputer(),
        LookupTransformer(employment_mapping, "Other"),
        PMMLLabelBinarizer()
    ])]
    categorical_features = [([col], [
        CategoricalDomain(missing_values=None),
        CategoricalImputer(),
        PMMLLabelBinarizer()
    ]) for col in ["Education", "Marital", "Occupation"]]
    gender_feature = [("Gender", [
        CategoricalDomain(missing_values=None),
        CategoricalImputer(),
        LookupTransformer(gender_mapping, None)
    ])]
    mapper = DataFrameMapper(numeric_features + employment_feature +
                             categorical_features + gender_feature)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    result = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        proba = DataFrame(pipeline.predict_proba(audit_na_X),
                          columns=["probability(0)", "probability(1)"])
        result = pandas.concat((result, proba), axis=1)
    store_csv(result, name + ".csv")
def save_pmml_model(model_path):
    '''
    Train an XGBoost classifier on the five-column product-matching data
    (training data is not resampled) and save it as a PMML file so it can be
    parsed on the Java side.
    :param model_path: output path of the PMML file.
    :return: None
    '''
    dataset = pd.read_csv(
        '/Users/looker/project/xmodel/same_product_judge/data/five_col_training_data.csv'
    )
    # BUG FIX: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
    # use .loc with a boolean column mask instead.
    X = dataset.loc[:, dataset.columns != 'label']
    Y = dataset.loc[:, dataset.columns == 'label'].values.ravel()
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=33)
    ### fit model for train data
    model = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,  # number of boosted trees
        max_depth=6,  # maximum tree depth
        min_child_weight=1,  # minimum leaf weight
        gamma=0.,  # penalty on the number of leaves
        subsample=1,  # row subsampling ratio per tree
        # FIX: was misspelled "colsample_btree" (silently ignored by xgboost);
        # 1 is the default value, so fitted behavior is unchanged.
        colsample_bytree=1,
        scale_pos_weight=1,  # compensates class imbalance
        random_state=27,  # random seed
        objective='binary:logistic')
    # Save in PMML format: identity mapper over the four feature columns,
    # then the classifier, fitted on the full data set.
    mapper = DataFrameMapper([(['f0'], None), (['f1'], None), (['f2'], None),
                              (['f3'], None)])
    pipeline = PMMLPipeline([('mapper', mapper), ("classifier", model)])
    pipeline.fit(X, Y)
    sklearn2pmml(pipeline, model_path, with_repr=True)
示例#29
0
import xgboost
from sklearn import datasets
from sklearn2pmml import sklearn2pmml
from sklearn2pmml import PMMLPipeline

# NOTE(review): sklearn.datasets.load_boston was removed in scikit-learn 1.2,
# and XGBRegressor's `silent` kwarg was replaced by `verbosity` in xgboost 1.0;
# this script requires older versions of both libraries.
boston = datasets.load_boston()
X = boston.data
y = boston.target
feature_names = boston.feature_names
model = xgboost.XGBRegressor(learning_rate=0.1,
                             n_estimators=10,
                             max_depth=10,
                             silent=False)

# active_fields is set manually because fit() receives a bare ndarray, which
# carries no column names to record.
boston_pipeline = PMMLPipeline([("regressor", model)])
boston_pipeline.active_fields = feature_names
boston_pipeline.fit(X, y)

# debug=True requests verbose output from the PMML conversion.
sklearn2pmml(boston_pipeline, "boston.pmml", with_repr=True, debug=True)
"""
@author:duke.du
@time:2018/11/29 19:19
@file:dust_pmml.py
@contact: [email protected]
@function:训练模型,生成PMML文件
"""
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn2pmml import PMMLPipeline, sklearn2pmml  # No problem

iris = load_iris()

# 创建带有特征名称的 DataFrame
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
print("iris' size :", iris_df.shape)

# 创建模型管道
iris_pipeline = PMMLPipeline([
    ("classifier", RandomForestClassifier())
])

# 训练模型
iris_pipeline.fit(iris_df, iris.target)

# 导出模型到 RandomForestClassifier_Iris.pmml 文件
sklearn2pmml(iris_pipeline, "pmml/RandomForestClassifier_Iris.pmml")