def getFirstContent(dataUrl, modelUrl, modelName):
    """Train a TF-IDF + LogisticRegression text classifier and export it.

    Parameters:
        dataUrl: directory of training documents (sklearn ``load_files`` layout).
        modelUrl: output directory (expected to end with a path separator).
        modelName: file name for the exported model.

    Returns:
        "success" if the model file exists afterwards, else "fail".
    """
    training_data = load_files(dataUrl, encoding="utf-8")
    # Feature extraction, step 1: raw term-frequency counts.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(training_data.data)
    # Feature extraction, step 2: TF-IDF weighting of the counts.
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    # Fit a simple classifier inside a PMML-exportable pipeline.
    mnb_pipeline = PMMLPipeline([("classifier", LogisticRegression())])
    mnb_pipeline.fit(X_train_tfidf, training_data.target)
    # FIX: the original used C-style `//` comments here, which are a Python
    # syntax error; they are now proper `#` comments.
    # Save in pkl format.
    joblib.dump(mnb_pipeline, modelUrl + modelName)
    # Save in pmml format.  NOTE(review): this writes to the SAME path as the
    # joblib dump above and therefore overwrites it — confirm whether two
    # distinct file names were intended.
    sklearn2pmml(mnb_pipeline, modelUrl + modelName, with_repr=True)
    if os.path.exists(modelUrl + modelName):
        return "success"
    else:
        return "fail"
def convert_sklearn_to_pmml(model, pmml, feature_names=None, target_name=None):
    """Wrap *model* in a PMMLPipeline and export it to the *pmml* file.

    Optional feature/target names are attached to the pipeline so the
    generated PMML carries meaningful field names instead of x1..xn / y.
    """
    wrapped = PMMLPipeline([("regressor", model)])
    if feature_names is not None:
        wrapped.active_fields = feature_names
    if target_name is not None:
        wrapped.target_field = target_name
    sklearn2pmml(wrapped, pmml, with_repr=True, debug=True)
def train(data_conf, model_conf, **kwargs):
    """Python train method called by AOA framework

    Parameters:
    data_conf (dict): The dataset metadata
    model_conf (dict): The model configuration to use

    Returns:
    None: No return
    """
    # load data & engineer
    iris_df = pd.read_csv(data_conf['location'])
    # Local renamed from `train` to avoid shadowing this function.
    train_df, _ = train_test_split(iris_df, test_size=0.5, random_state=42)
    # FIX: `.drop("species", 1)` used the positional `axis` argument, which
    # is deprecated and removed in pandas >= 2.0; use the keyword form.
    X = train_df.drop(columns="species")
    y = train_df['species']

    print("Starting training...")

    # fit model to training data
    classifier = PMMLPipeline([('classifier', RandomForestClassifier())])
    classifier.fit(X, y.values.ravel())
    print("Finished training")

    # export model artefacts to models/ folder
    if not os.path.exists('models'):
        os.makedirs('models')
    sklearn2pmml(classifier, "models/model.pmml")
    print("Saved trained model")
def xgboost_to_pmml(data_X, data_y, par_file, save_model_as):
    """Train an XGBoost classifier and persist it as a PMML file.

    Parameters
    ----------
    data_X : pandas.DataFrame
        Training features.
    data_y : pandas.DataFrame
        Training labels.
    par_file : str
        Path to a JSON file holding the model's hyper-parameters.
    save_model_as : str
        Destination path of the PMML file.

    Returns
    -------
    None
        Writes the PMML file locally to `save_model_as`.

    Examples
    --------
    >>> xgboost_to_pmml(data_x, data_y, "par.json", "model.pmml")
    """
    # Hyper-parameters come from the JSON file on disk.
    with open(par_file, "r") as fh:
        params = json.load(fh)
    estimator = XGBClassifier(**params)
    # Wrap in a PMML-exportable pipeline, fit, then export.
    pmml_pipeline = PMMLPipeline([("classifier", estimator)])
    pmml_pipeline.fit(data_X, data_y)
    sklearn2pmml(pmml_pipeline, save_model_as, with_repr=True)
def save_as_PMML(data, modelPath):
    """Fit a linear regression (data[["x"]] -> data["y"]) and save it as PMML.

    FIX: the original ignored the `modelPath` parameter and always wrote the
    hard-coded file "linear.pmml"; the model is now written to `modelPath`
    as the signature promises.
    """
    model = PMMLPipeline([("regressor", linear_model.LinearRegression())])
    model.fit(data[["x"]], data["y"])
    sklearn2pmml(model, modelPath, with_repr=True)
def train_and_save_model(data, model_path):
    """Fit a simple linear regression (x -> y) and export it as PMML
    to `model_path` via sklearn2pmml."""
    regressor = linear_model.LinearRegression()
    pipeline = PMMLPipeline([("regressor", regressor)])
    pipeline.fit(data[["x"]], data["y"])
    sklearn2pmml(pipeline, model_path)
def build_housing(regressor, name, with_kneighbors=False):
    """Fit, store and score a housing-price regressor pipeline.

    Stores the fitted pipeline as `<name>.pkl` and the MEDV predictions as
    `<name>.csv`; when `with_kneighbors` is true, 1-based neighbour row ids
    are appended to the CSV.
    """
    domain_mapper = DataFrameMapper(
        [(housing_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([
        ("mapper", domain_mapper),
        ("transformer-pipeline", Pipeline([
            ("polynomial", PolynomialFeatures(degree=2,
                                              interaction_only=True,
                                              include_bias=False)),
            ("scaler", StandardScaler()),
            ("selector", SelectorProxy(
                SelectPercentile(score_func=f_regression, percentile=35))),
        ])),
        ("regressor", regressor),
    ])
    pipeline.fit(housing_X, housing_y)
    store_pkl(pipeline, name + ".pkl")
    medv = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    if with_kneighbors == True:
        Xt = pipeline_transform(pipeline, housing_X)
        _, neighbor_ids = regressor.kneighbors(Xt)
        id_columns = ["neighbor(" + str(i + 1) + ")"
                      for i in range(regressor.n_neighbors)]
        medv_ids = DataFrame(neighbor_ids + 1, columns=id_columns)
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")
def save_model(model, feature_names, model_path, label_text="label"):
    """Persist *model* in the format implied by *model_path*'s extension.

    ".pmml" exports a PMMLPipeline; ".onnx" converts via skl2onnx and embeds
    aict-tools metadata; any other extension gets a plain joblib pickle at
    *model_path*.  A joblib pickle is ALWAYS written — to "<stem>.pkl" in
    the pmml/onnx cases, to *model_path* itself otherwise.

    Raises:
        ImportError: if the optional dependency for the chosen format
            (sklearn2pmml / skl2onnx) is not installed.
    """
    p, extension = os.path.splitext(model_path)
    # Attach the feature names to the model itself so they survive pickling.
    model.feature_names = feature_names
    pickle_path = p + ".pkl"
    if extension == ".pmml":
        # Lazy import: sklearn2pmml is an optional dependency.
        try:
            from sklearn2pmml import sklearn2pmml, PMMLPipeline
        except ImportError:
            raise ImportError(
                "You need to install `sklearn2pmml` to store models in pmml format"
            )
        pipeline = PMMLPipeline([("model", model)])
        pipeline.target_field = label_text
        pipeline.active_fields = np.array(feature_names)
        sklearn2pmml(pipeline, model_path)
    elif extension == ".onnx":
        # Lazy import: skl2onnx/onnx are optional dependencies.
        try:
            from skl2onnx import convert_sklearn
            from skl2onnx.common.data_types import FloatTensorType
            from skl2onnx.helpers.onnx_helper import select_model_inputs_outputs
            from onnx.onnx_pb import StringStringEntryProto
        except ImportError:
            raise ImportError(
                "You need to install `skl2onnx` to store models in onnx format"
            )
        onnx = convert_sklearn(
            model,
            name=label_text,
            initial_types=[("input", FloatTensorType((None, len(feature_names))))],
            doc_string="Model created by aict-tools to estimate {}".format(label_text),
        )
        # this makes sure we only get the scores and that they are numpy
        # arrays and not a list of dicts.
        # must come before setting metadata as it clears the metadata_props
        if hasattr(model, "predict_proba"):
            onnx = select_model_inputs_outputs(onnx, ["probabilities"])
        # Embed provenance/metadata key-value pairs into the ONNX model.
        metadata = dict(
            model_author="aict-tools",
            aict_tools_version=__version__,
            feature_names=",".join(feature_names),
            model_type="classifier" if is_classifier(model) else "regressor",
        )
        for key, value in metadata.items():
            onnx.metadata_props.append(StringStringEntryProto(key=key, value=value))
        with open(model_path, "wb") as f:
            f.write(onnx.SerializeToString())
    else:
        # Unknown extension: the joblib dump below IS the requested output.
        pickle_path = model_path
    # Always store the pickle dump, just in case
    joblib.dump(model, pickle_path, compress=4)
def pmml(x, Y):
    """Fit a logistic-regression pipeline on (x, Y) and export it to
    "LogisticRegression.pmml" in the working directory."""
    from sklearn2pmml import PMMLPipeline, sklearn2pmml
    pipeline = PMMLPipeline([("classifier", LogisticRegression())])
    # Train the model.
    pipeline.fit(x, Y)
    sklearn2pmml(pipeline, "LogisticRegression.pmml")
def build_auto_na(regressor, name):
    """Fit *regressor* on the auto dataset with missing values and store the
    fitted pipeline (`<name>.pkl`) plus its mpg predictions (`<name>.csv`).

    Numeric columns are imputed from None markers; categorical columns are
    imputed from the -1 marker and label-binarized.
    """
    numeric_columns = ["acceleration", "displacement", "horsepower", "weight"]
    categorical_columns = ["cylinders", "model_year", "origin"]
    mapper = DataFrameMapper(
        [([column], [ContinuousDomain(missing_values=None), Imputer()])
         for column in numeric_columns] +
        [([column], [CategoricalDomain(missing_values=-1),
                     CategoricalImputer(missing_values=-1),
                     PMMLLabelBinarizer()])
         for column in categorical_columns]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor),
    ])
    pipeline.fit(auto_na_X, auto_na_y)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
def pickle_model(classifier, feature_names, model_path, label_text='label'):
    """Persist *classifier*; a ``.pmml`` path additionally triggers PMML export.

    In the PMML case a compressed pickle copy is written next to the PMML
    file (same stem, ``.pkl`` extension); otherwise only the pickle at
    *model_path* is written.
    """
    stem, extension = path.splitext(model_path)
    classifier.feature_names = feature_names
    if extension == '.pmml':
        joblib.dump(classifier, stem + '.pkl', compress=4)
        pmml_pipeline = PMMLPipeline([('classifier', classifier)])
        pmml_pipeline.target_field = label_text
        pmml_pipeline.active_fields = np.array(feature_names)
        sklearn2pmml(pmml_pipeline, model_path)
    else:
        joblib.dump(classifier, model_path, compress=4)
class FirstStep(object):
    """End-to-end iris demo: split, scale, train a decision tree inside a
    PMMLPipeline, persist it with joblib, reload it and predict."""

    def __init__(self):
        self.__iris = load_iris()
        self.__X = pd.DataFrame(self.__iris.data,
                                columns=self.__iris.feature_names)
        self.__y = pd.DataFrame(self.__iris.target, columns=["Species"])
        self.__train = None
        self.__train_label = None
        self.__test = None
        self.__test_one_sample = None
        self.__test_label = None
        self.__mapper = None
        self.__estimator = None
        self.__pipeline = None

    def train_test_split_step(self):
        """Split 80/20 into train/test and reset all partition indexes."""
        self.__train, self.__test, self.__train_label, self.__test_label = (
            train_test_split(self.__X, self.__y, test_size=0.2))
        self.__train = self.__train.reset_index(drop=True)
        self.__train_label = self.__train_label.reset_index(drop=True)
        self.__test = self.__test.reset_index(drop=True)
        # FIX: the original assigned the TRAIN features to `__test_label`
        # (copy-paste bug); reset the index of the test labels instead.
        self.__test_label = self.__test_label.reset_index(drop=True)

    def feature_engineering_step(self):
        """Standard-scale the four iris feature columns."""
        self.__mapper = (DataFrameMapper([([
            "sepal length (cm)", "sepal width (cm)", "petal length (cm)",
            "petal width (cm)"
        ], [StandardScaler()])]))

    def model_train_step(self):
        """Choose the estimator (an unfitted decision tree)."""
        self.__estimator = DecisionTreeClassifier()

    def pipeline_step(self):
        """Assemble mapper + estimator into a PMMLPipeline and fit it."""
        self.__pipeline = PMMLPipeline([("mapper", self.__mapper),
                                        ("estimator", self.__estimator)])
        self.__pipeline.fit(self.__train, self.__train_label)

    def output_step(self):
        """Persist the fitted pipeline as a compressed joblib dump."""
        joblib.dump(self.__pipeline,
                    "C:\\Users\\Dell\\Desktop\\pipeline.pkl.z",
                    compress=3)

    def input_step(self):
        """Reload the pipeline and predict on the test set (full + 1 row)."""
        self.__pipeline = joblib.load(
            "C:\\Users\\Dell\\Desktop\\pipeline.pkl.z")
        self.__test_one_sample = self.__test[0:1]
        print(self.__pipeline.predict(self.__test))
        # Pass a single-row record.
        print(self.__pipeline.predict(self.__test_one_sample))
def __init__(self):
    """Build a Douban movie-review sentiment classifier (SVM over TF-IDF
    n-grams), persist the fitted pipeline, and print test-set metrics."""
    print('读取语料库:')
    # Corpus line format: title \t label \t review-text …
    seed_list, content_list = self.get_data(
        './data/豆瓣')
    print('\t' + '好评数:' + str(len(seed_list)) + ' 差评数:' +
          str(len(content_list)))
    # Label seed (positive) reviews 0 and content (negative) reviews 1.
    seed_y = [0 for i in range(0, len(seed_list))]
    content_y = [1 for i in range(0, len(content_list))]
    queries = content_list + seed_list
    y = content_y + seed_y
    # Vectorize the text as TF-IDF over custom n-gram tokens.
    self.vectorizer = TfidfVectorizer(tokenizer=self.get_ngrams)
    X = self.vectorizer.fit_transform(queries)
    print('向量化后维度:' + str(X.shape))
    print('划分训练集、测试集...')
    # Split X/y with train_test_split:
    # X_train rows correspond one-to-one with y_train  -> used for training
    # X_test rows correspond one-to-one with y_test    -> used for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=46)
    print('划分完成,训练集开始训练分类器...')
    #self.model = LogisticRegression()
    self.model = svm.SVC()
    #self.model=MultinomialNB(alpha=0.001)
    self.pipeline = PMMLPipeline([("classifier", self.model)])
    self.pipeline.fit(X_train, y_train)
    # compress=9: maximum joblib compression level.
    joblib.dump(self.pipeline, "./result/classifier.pkl.z",
                compress=9)
    print('训练完毕!!! 测试集开始预测结果...')
    predict = self.pipeline.predict(X_test)
    print("精度:{0:f}".format(
        metrics.precision_score(y_test, predict, average="weighted")))
    print("召回:{0:f}".format(
        metrics.recall_score(y_test, predict, average="weighted")))
    print("f1-score:{0:f}".format(
        metrics.f1_score(y_test, predict, average="weighted")))
    print("预测完毕!!!!")
    print('***********************************************************')
    print('***********************************************************')
def build_auto(regressor, name):
    """Fit *regressor* on the auto-mpg dataset and store the fitted pipeline
    (`<name>.pkl`) along with its mpg predictions (`<name>.csv`)."""
    feature_mapper = DataFrameMapper([
        (["cylinders"], CategoricalDomain()),
        (["displacement", "horsepower", "weight", "acceleration"],
         [ContinuousDomain(), Imputer(missing_values="NaN"), StandardScaler()]),
        # Pre/post 1973 oil crisis effects
        (["model_year"], [CategoricalDomain(), Binarizer(threshold=77)],
         {"alias": "bin(model_year, 77)"}),
        (["origin"], OneHotEncoder()),
    ])
    auto_pipeline = PMMLPipeline([
        ("mapper", feature_mapper),
        ("regressor", regressor),
    ])
    auto_pipeline.fit(auto_X, auto_y)
    store_pkl(auto_pipeline, name + ".pkl")
    predictions = DataFrame(auto_pipeline.predict(auto_X), columns=["mpg"])
    store_csv(predictions, name + ".csv")
class XgbModel(object):
    """Train, score, evaluate and persist an XGBoost binary classifier.

    The estimator is wrapped in a PMMLPipeline; pre-split train/test data is
    supplied to the constructor.
    """

    def __init__(self, train, train_label, test, test_label):
        # Pre-split data, used as-is.
        self.__train = train
        self.__train_label = train_label
        self.__test = test
        self.__test_label = test_label
        self.__bst = None               # fitted PMMLPipeline after train()
        self.__feat_imp = None          # feature-importance Series (plot)
        self.__test_preds = None        # positive-class probabilities
        self.__test_predictions = None  # hard class predictions
        self.__output = None            # combined evaluation matrix

    def train(self):
        """Fit an XGBClassifier (binary logistic) inside a PMMLPipeline."""
        self.__bst = XGBClassifier(objective="binary:logistic")
        # NOTE: __bst is rebound here from the bare estimator to the pipeline.
        self.__bst = PMMLPipeline([("estimator", self.__bst)])
        self.__bst.fit(self.__train, self.__train_label,
                       estimator__eval_metric="auc")

    def predict(self):
        """Compute positive-class probabilities and hard predictions."""
        self.__test_preds = self.__bst.predict_proba(self.__test)[:, 1]
        self.__test_predictions = self.__bst.predict(self.__test)

    def feature_importances(self):
        """Plot feature importances as a bar chart.

        NOTE(review): after train(), __bst is a PMMLPipeline, which does not
        expose `feature_importances_`; also the labels ["gbc", "rf", "ab",
        "lr"] assume exactly four input features — confirm the intended
        call order and feature layout.
        """
        self.__feat_imp = (pd.Series(
            self.__bst.feature_importances_,
            ["gbc", "rf", "ab", "lr"]).sort_values(ascending=False))
        self.__feat_imp.plot(kind="bar", title="Feature Importances")
        plt.ylabel("Feature Importance Score")
        plt.show()

    def evaluate(self):
        """Print AUC (from probabilities) and accuracy (from predictions)."""
        print("auc : %.4f" %
              roc_auc_score(self.__test_label, self.__test_preds))
        print("accuracy score : %.4f" %
              accuracy_score(self.__test_label, self.__test_predictions))

    def evaluate_output(self):
        """Dump [features | label | probability] rows to a desktop CSV."""
        self.__output = np.hstack(
            (self.__test, self.__test_label.reshape(
                (-1, 1)), self.__test_preds.reshape((-1, 1))))
        pd.DataFrame(
            self.__output).to_csv("C:\\Users\\Dell\\Desktop\\output.csv")

    def xgbmodel_output(self):
        """Persist the fitted pipeline with joblib (compressed)."""
        joblib.dump(self.__bst,
                    "C:\\Users\\Dell\\Desktop\\bstML.pkl.z",
                    compress=True)
def gen_pmml_to_hdfs(sc, model, args): """ 生成 pmml,pkl 到 hdfs """ # 先将pmml文件生成到driver的tmp目录下,然后再将其上传至HDFS # 目录名称 /tmp/时间戳/文件名 print("===========> 保存模型文件到 HDFS") pmml_model_name = constants.PMML_NAME pkl_model_name = constants.PKL_NAME dir_name = "/tmp/" + str(time.time()) args.tmp_dir = dir_name os.mkdir(dir_name) # 保存为pmml文件 pipeline = PMMLPipeline([("classifier", model)]) sklearn2pmml(pipeline, dir_name + os.sep + pmml_model_name, with_repr=True) joblib.dump(model, dir_name + os.sep + pkl_model_name) # 上传文件至HDFS with open(dir_name + os.sep + pmml_model_name, "r") as f1, \ open(dir_name + os.sep + pkl_model_name, "rb") as f2: data1 = f1.read() data2 = f2.read() save_data.write_data_to_cluster(sc, args.export_dir + os.sep + pmml_model_name, data1) save_data.write_data_to_cluster(sc, args.model_dir + os.sep + pkl_model_name, data2, is_text_file=False) # 删除临时文件 os.remove(dir_name + os.sep + pmml_model_name) os.remove(dir_name + os.sep + pkl_model_name)
def __model_to_pmml__():
    """Export self.regressor to a PMML file named "pmml", then rewrite the
    generic PMML field names (x1..xn, y) to the real column names.

    NOTE(review): this function references `self` and `data_df` although
    neither is a parameter — presumably it is meant to be a method or a
    nested closure; confirm against the caller.
    """
    pipeline = PMMLPipeline([("regressor", self.regressor)])
    sklearn2pmml(pipeline, "pmml", with_repr=True)
    print('creating pmml')
    # Read in the file
    with open('pmml', 'r') as file:
        filedata = file.read()
    print('finding matches')
    # Collect the distinct x<i> placeholders in first-appearance order.
    m = re.findall(r'x\-?\d+', filedata)
    matches = []
    print('sorting matches')
    for match in m:
        # FIX: the original used `break` here, which stopped scanning at the
        # first duplicate and dropped all later unique placeholders.
        if match in matches:
            continue
        matches.append(match)
    feature_cols = list(
        data_df.columns.difference(
            ["in_set", "smiles", "id", self.target_name]))
    matched_dict = dict(zip(matches, feature_cols))
    print('replacing')
    # FIX: replace longest placeholders first so that e.g. "x1" does not
    # corrupt "x10" before "x10" has been substituted.
    for match in sorted(matched_dict, key=len, reverse=True):
        filedata = filedata.replace(match, matched_dict[match])
    # FIX: the original `filedata.replace('y', ...)` replaced EVERY letter
    # 'y' in the document (including inside words like "probability"); only
    # whole-word 'y' field references are rewritten now.
    filedata = re.sub(r'\by\b', self.target_name, filedata)
    print('rewrite to file')
    # Write the file out again
    with open('pmml', 'w') as file:
        file.write(filedata)
def get_training_data(con):
    """Load quiz responses joined with their choice text, pivot to one row
    per session, and build an untrained GaussianNB pipeline over the
    binarized categorical answers.

    Returns the pivoted DataFrame and the (unfitted) pipeline.
    """
    sql = """select user_responses.id as id, drink_name as drink, user_responses.question_name as question_name, question_choices.choice as choice, session_id from user_responses inner join question_choices on user_responses.question_choice = question_choices.id"""
    data = pd.read_sql(sql, con=con, index_col='id')
    print(data)
    # One row per session; 'choice' and 'drink' become column groups keyed
    # by question_name.
    data = data.pivot(index='session_id', columns='question_name',
                      values=['choice', 'drink'])
    print(data)
    transformation = DataFrameMapper([
        (["hotdog"], [CategoricalDomain(), LabelBinarizer()]),
        (["tp"], [CategoricalDomain(), LabelBinarizer()]),
        (["personality"], [CategoricalDomain(), LabelBinarizer()]),
    ])
    pipeline = PMMLPipeline([
        ("transformation", transformation),
        ("classifier", GaussianNB()),
    ])
    return data, pipeline
def pickle_model(classifier, feature_names, model_path, label_text='label'):
    """Store *classifier* on disk, choosing the format from the extension.

    For a ``.pmml`` target, a compressed joblib pickle is first written to
    the same stem with a ``.pkl`` extension and the classifier is then
    exported as PMML; any other extension yields only the joblib pickle.
    """
    stem, ext = os.path.splitext(model_path)
    classifier.feature_names = feature_names
    if ext != '.pmml':
        joblib.dump(classifier, model_path, compress=4)
        return
    joblib.dump(classifier, stem + '.pkl', compress=4)
    export_pipeline = PMMLPipeline([
        ('classifier', classifier)
    ])
    export_pipeline.target_field = label_text
    export_pipeline.active_fields = np.array(feature_names)
    sklearn2pmml(export_pipeline, model_path)
def build_wheat(kmeans, name, with_affinity=True):
    """Cluster the wheat dataset with *kmeans* (min-max scaled features),
    store the fitted pipeline and the cluster labels; optionally append one
    distance ("affinity") column per cluster."""
    mapper = DataFrameMapper([(wheat_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("scaler", MinMaxScaler()),
        ("clusterer", kmeans),
    ])
    pipeline.fit(wheat_X)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity == True:
        Xt = pipeline_transform(pipeline, wheat_X)
        affinities = [kmeans_distance(kmeans, c, Xt) for c in range(3)]
        cluster_affinity = DataFrame(
            numpy.transpose(affinities),
            columns=["affinity(0)", "affinity(1)", "affinity(2)"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
def model_wrapper_fit(self):
    """Wrap every configured estimator in its own PMMLPipeline, fit each on
    the training data, and collect the fitted pipelines; finally print the
    logistic-regression coefficients and intercept."""
    candidates = [
        self.__gradient_boosting_classifier,
        self.__random_forest_classifier,
        self.__logistic_regression,
        self.__k_neighbors_classifier,
        self.__extra_tree_classifier,
        self.__xgb_classifier,
    ]
    self.__model_list.extend(candidates)
    for estimator in self.__model_list:
        wrapped = PMMLPipeline([("estimator", estimator)])
        wrapped.fit(self.__train, self.__train_label)
        self.__pmml_model_list.append(wrapped)
    print(
        self.__logistic_regression.fit(self.__train,
                                       self.__train_label).coef_)
    print(
        self.__logistic_regression.fit(self.__train,
                                       self.__train_label).intercept_)
def test_fit_verify(self):
    """PMMLPipeline.fit must (re)capture active/target field names, and
    verify() must record verification data matching the fitted columns."""
    pipeline = PMMLPipeline([("estimator", DummyRegressor())])
    self.assertFalse(hasattr(pipeline, "active_fields"))
    self.assertFalse(hasattr(pipeline, "target_fields"))
    frame = DataFrame([[1, 0], [2, 0], [3, 0]], columns=["X1", "X2"])
    target = Series([0.5, 1.0, 1.5], name="y")
    pipeline.fit(frame, target)
    self.assertEqual(["X1", "X2"], pipeline.active_fields.tolist())
    self.assertEqual("y", pipeline.target_fields.tolist())
    # Re-fitting with renamed columns must refresh the captured names.
    frame.columns = ["x1", "x2"]
    pipeline.fit(frame, target)
    self.assertEqual(["x1", "x2"], pipeline.active_fields.tolist())
    self.assertEqual("y", pipeline.target_fields.tolist())
    self.assertFalse(hasattr(pipeline, "verification"))
    pipeline.verify(frame.sample(2))
    self.assertEqual(2, len(pipeline.verification.active_values))
    self.assertEqual(2, len(pipeline.verification.target_values))
    # Column order differing from the fitted order must be rejected.
    frame.columns = ["x2", "x1"]
    with self.assertRaises(ValueError):
        pipeline.verify(frame.sample(2))
def build_audit(classifier, name, with_proba=True):
    """Fit *classifier* on the audit dataset and store the fitted pipeline
    (`<name>.pkl`) plus its predictions — and, when requested, the two class
    probabilities — as `<name>.csv`."""
    audit_mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(
                EstimatorProxy(DecisionTreeClassifier(random_state=13)),
                threshold="1.25 * mean"),
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(SelectFromModel(
                EstimatorProxy(RandomForestClassifier(random_state=13,
                                                      n_estimators=3)),
                threshold="median")),
        ]),
        ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(), SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()),
        ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()),
        ("Hours", ContinuousDomain()),
    ])
    audit_pipeline = PMMLPipeline([
        ("mapper", audit_mapper),
        ("classifier", classifier),
    ])
    audit_pipeline.fit(audit_X, audit_y)
    store_pkl(audit_pipeline, name + ".pkl")
    adjusted = DataFrame(audit_pipeline.predict(audit_X),
                         columns=["Adjusted"])
    if with_proba == True:
        proba = DataFrame(audit_pipeline.predict_proba(audit_X),
                          columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_sentiment(classifier, name, with_proba=True):
    """Fit a TF-IDF + chi2-selection + classifier pipeline on the sentiment
    dataset and store the fitted pipeline plus its scores."""
    # float32 matrix for RandomForestClassifier, float64 otherwise.
    tfidf_dtype = (numpy.float32
                   if isinstance(classifier, RandomForestClassifier)
                   else numpy.float64)
    sentiment_pipeline = PMMLPipeline([
        ("tf-idf", TfidfVectorizer(
            analyzer="word",
            preprocessor=None,
            strip_accents=None,
            lowercase=True,
            token_pattern=None,
            tokenizer=Splitter(),
            stop_words="english",
            ngram_range=(1, 2),
            norm=None,
            dtype=tfidf_dtype)),
        ("selector", SelectorProxy(SelectPercentile(chi2, percentile=10))),
        ("classifier", classifier),
    ])
    sentiment_pipeline.fit(sentiment_X, sentiment_y)
    store_pkl(sentiment_pipeline, name + ".pkl")
    score = DataFrame(sentiment_pipeline.predict(sentiment_X),
                      columns=["Score"])
    if with_proba == True:
        score_proba = DataFrame(
            sentiment_pipeline.predict_proba(sentiment_X),
            columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
def build_audit(classifier, name, with_proba=True, **kwargs):
    """Fit *classifier* on the audit dataset using a union of continuous
    features and polynomially-expanded categorical features; store the
    fitted pipeline plus its predictions (and probabilities if asked)."""
    continuous_mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Income", ContinuousDomain()),
        ("Hours", ContinuousDomain()),
    ])
    categorical_mapper = DataFrameMapper([
        ("Employment", [CategoricalDomain(), LabelBinarizer(),
                        SelectorProxy(SelectFromModel(EstimatorProxy(
                            DecisionTreeClassifier(random_state=13))))]),
        ("Education", [CategoricalDomain(), LabelBinarizer(),
                       SelectorProxy(SelectFromModel(
                           EstimatorProxy(RandomForestClassifier(
                               random_state=13, n_estimators=3)),
                           threshold="1.25 * mean"))]),
        ("Marital", [CategoricalDomain(),
                     LabelBinarizer(neg_label=-1, pos_label=1),
                     SelectKBest(k=3)]),
        ("Occupation", [CategoricalDomain(), LabelBinarizer(),
                        SelectorProxy(SelectKBest(k=3))]),
        ("Gender", [CategoricalDomain(),
                    LabelBinarizer(neg_label=-3, pos_label=3)]),
        ("Deductions", [CategoricalDomain(), LabelEncoder()]),
    ])
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures()),
            ])),
        ])),
        ("classifier", classifier),
    ])
    pipeline.fit(audit_X, audit_y)
    # Post-fit estimator customization hook.
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_iris(classifier, name, with_proba=True):
    """Fit *classifier* on iris features (raw + log10 scale, robust-scaled,
    PCA-reduced to 3 components) and store the pipeline plus predictions."""
    normal_mapper = DataFrameMapper([
        (iris_X.columns.values, ContinuousDomain()),
    ])
    log_mapper = DataFrameMapper([
        (iris_X.columns.values, FunctionTransformer(numpy.log10)),
    ])
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            ("normal_scale", normal_mapper),
            ("log_scale", log_mapper),
        ])),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier),
    ])
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba == True:
        species_proba = DataFrame(
            pipeline.predict_proba(iris_X),
            columns=["probability(setosa)", "probability(versicolor)",
                     "probability(virginica)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def save_model_to_local_file(booster, model_params, meta, filename):
    """Save an XGBoost Booster natively and additionally export it as PMML.

    A sklearn-style wrapper (XGBClassifier / XGBRegressor / XGBRanker,
    chosen from the training objective) is faked around the booster so that
    sklearn2pmml can export it.

    Args:
        booster: trained xgboost Booster.
        model_params: dict of training parameters; "objective" selects the
            wrapper class and "num_class" is required for multi-class.
        meta: model metadata, persisted via save_model_metadata().
        filename: target path of the native model file; the PMML export is
            written to "<filename>.pmml".
    """
    from sklearn2pmml import PMMLPipeline, sklearn2pmml
    try:
        from xgboost.compat import XGBoostLabelEncoder
    except:  # noqa: E722
        # xgboost==0.82.0 does not have XGBoostLabelEncoder
        # in xgboost.compat.py
        from xgboost.sklearn import XGBLabelEncoder as XGBoostLabelEncoder
    objective = model_params.get("objective")
    bst_meta = dict()
    if objective.startswith("binary:") or objective.startswith("multi:"):
        if objective.startswith("binary:"):
            num_class = 2
        else:
            num_class = model_params.get("num_class")
        assert num_class is not None and num_class > 0, \
            "num_class should not be None"
        # To fake a trained XGBClassifier, there must be "_le", "classes_",
        # inside XGBClassifier. See here:
        # https://github.com/dmlc/xgboost/blob/d19cec70f1b40ea1e1a35101ca22e46dd4e4eecd/python-package/xgboost/sklearn.py#L356
        model = xgb.XGBClassifier()
        label_encoder = XGBoostLabelEncoder()
        label_encoder.fit(list(range(num_class)))
        model._le = label_encoder
        model.classes_ = model._le.classes_
        bst_meta["_le"] = {"classes_": model.classes_.tolist()}
        bst_meta["classes_"] = model.classes_.tolist()
    elif objective.startswith("reg:"):
        model = xgb.XGBRegressor()
    elif objective.startswith("rank:"):
        model = xgb.XGBRanker()
    else:
        raise ValueError(
            "Not supported objective {} for saving PMML".format(objective))
    model_type = type(model).__name__
    bst_meta["type"] = model_type
    # Meta data is needed for saving sklearn pipeline. See here:
    # https://github.com/dmlc/xgboost/blob/d19cec70f1b40ea1e1a35101ca22e46dd4e4eecd/python-package/xgboost/sklearn.py#L356
    booster.set_attr(scikit_learn=json.dumps(bst_meta))
    booster.save_model(filename)
    save_model_metadata("model_meta.json", meta)
    # Clear the attribute again so it is not left behind on the booster.
    booster.set_attr(scikit_learn=None)
    # Round-trip through the saved file so the wrapper adopts the booster.
    model.load_model(filename)
    pipeline = PMMLPipeline([(model_type, model)])
    sklearn2pmml(pipeline, "{}.pmml".format(filename))
def get_model(PARAMS):
    """Build a PMML pipeline around a GradientBoostingClassifier configured
    from the given parameters.

    :param PARAMS: mapping of hyper-parameter name -> value; keys that are
        not attributes of the estimator are silently ignored.
    :return: an unfitted PMMLPipeline wrapping the configured estimator.
    """
    estimator = GradientBoostingClassifier()
    for key in PARAMS:
        if hasattr(estimator, key):
            setattr(estimator, key, PARAMS.get(key))
    return PMMLPipeline([('estimator', estimator)])
def get_sample_data(con):
    """Read the sample training table and build an untrained GaussianNB
    pipeline over its binarized categorical columns.

    Returns the DataFrame (indexed by "id") and the unfitted pipeline.
    """
    frame = pd.read_sql("select * from sample_training_data",
                        con=con, index_col="id")
    transformation = DataFrameMapper([
        (["hotdog"], [CategoricalDomain(), LabelBinarizer()]),
        (["tp"], [CategoricalDomain(), LabelBinarizer()]),
    ])
    nb_pipeline = PMMLPipeline([
        ("transformation", transformation),
        ("classifier", GaussianNB()),
    ])
    return frame, nb_pipeline
def build_wheat(kmeans, name, with_affinity=True):
    """Cluster the wheat dataset (log10-transformed, min-max scaled) with
    *kmeans*; store the pipeline, the labels, and optional per-cluster
    distance ("affinity") columns."""
    wheat_columns = [
        "Area", "Perimeter", "Compactness", "Kernel.Length",
        "Kernel.Width", "Asymmetry", "Groove.Length",
    ]
    mapper = DataFrameMapper([(wheat_columns, ContinuousDomain())])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("transformer", FunctionTransformer(numpy.log10)),
        ("scaler", MinMaxScaler()),
        ("clusterer", kmeans),
    ])
    pipeline.fit(wheat_X)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity == True:
        Xt = pipeline_transform(pipeline, wheat_X)
        affinities = [kmeans_distance(kmeans, c, Xt) for c in range(3)]
        cluster_affinity = DataFrame(
            numpy.transpose(affinities),
            columns=["affinity_0", "affinity_1", "affinity_2"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
def get_model(PARAMS):
    """Build a PMML pipeline around a LogisticRegression configured from the
    given parameters.

    :param PARAMS: mapping of hyper-parameter name -> value; keys that are
        not attributes of the estimator are silently ignored.
    :return: an unfitted PMMLPipeline wrapping the configured estimator.
    """
    estimator = LogisticRegression()
    for key in PARAMS:
        if hasattr(estimator, key):
            setattr(estimator, key, PARAMS.get(key))
    return PMMLPipeline([('estimator', estimator)])