class FirstStep(object):
    """End-to-end demo: split the Iris data, build a mapper + decision-tree
    PMML pipeline, persist it with joblib, and reload it for prediction."""

    def __init__(self):
        self.__iris = load_iris()
        self.__X = pd.DataFrame(self.__iris.data,
                                columns=self.__iris.feature_names)
        self.__y = pd.DataFrame(self.__iris.target, columns=["Species"])
        self.__train = None
        self.__train_label = None
        self.__test = None
        self.__test_one_sample = None
        self.__test_label = None
        self.__mapper = None
        self.__estimator = None
        self.__pipeline = None

    def train_test_split_step(self):
        """Split the data 80/20 and re-index every part from zero."""
        self.__train, self.__test, self.__train_label, self.__test_label = (
            train_test_split(self.__X, self.__y, test_size=0.2))
        self.__train = self.__train.reset_index(drop=True)
        self.__train_label = self.__train_label.reset_index(drop=True)
        self.__test = self.__test.reset_index(drop=True)
        # BUG FIX: the original assigned self.__train here, silently replacing
        # the test labels with the training feature frame.
        self.__test_label = self.__test_label.reset_index(drop=True)

    def feature_engineering_step(self):
        """Standardize all four iris features through one mapper entry."""
        self.__mapper = DataFrameMapper([([
            "sepal length (cm)", "sepal width (cm)",
            "petal length (cm)", "petal width (cm)"
        ], [StandardScaler()])])

    def model_train_step(self):
        self.__estimator = DecisionTreeClassifier()

    def pipeline_step(self):
        """Assemble mapper + estimator into a PMML pipeline and fit it."""
        self.__pipeline = PMMLPipeline([("mapper", self.__mapper),
                                        ("estimator", self.__estimator)])
        self.__pipeline.fit(self.__train, self.__train_label)

    def output_step(self):
        # compress=3 trades a little CPU for a much smaller pickle file.
        joblib.dump(self.__pipeline,
                    "C:\\Users\\Dell\\Desktop\\pipeline.pkl.z",
                    compress=3)

    def input_step(self):
        """Reload the persisted pipeline and predict on the test set."""
        self.__pipeline = joblib.load(
            "C:\\Users\\Dell\\Desktop\\pipeline.pkl.z")
        self.__test_one_sample = self.__test[0:1]
        print(self.__pipeline.predict(self.__test))
        # Predict on a single record (a one-row DataFrame slice).
        print(self.__pipeline.predict(self.__test_one_sample))
def build_sentiment(classifier, name, with_proba=True):
    """Fit a TF-IDF + chi2-selection + classifier PMML pipeline on the
    sentiment data, pickle it, and store its predictions as CSV.

    classifier -- scikit-learn classifier to place at the end of the pipeline
    name       -- basename for the emitted .pkl / .csv artifacts
    with_proba -- when truthy, also store the class probabilities
    """
    pipeline = PMMLPipeline([
        ("tf-idf", TfidfVectorizer(
            analyzer="word", preprocessor=None, strip_accents=None,
            lowercase=True, token_pattern=None, tokenizer=Splitter(),
            stop_words="english", ngram_range=(1, 2), norm=None,
            # Random forests cope with float32; other models keep full precision.
            dtype=(numpy.float32
                   if isinstance(classifier, RandomForestClassifier)
                   else numpy.float64))),
        ("selector", SelectorProxy(SelectPercentile(chi2, percentile=10))),
        ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba:  # idiomatic truthiness test instead of "== True"
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X),
                                columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
def build_audit(classifier, name, with_proba=True):
    """Fit a mapper + classifier PMML pipeline on the audit data and store
    the pickled pipeline plus predictions (optionally with probabilities)."""
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        # Binarize, then keep only the dummy columns an embedded model rates
        # above the given importance threshold.
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(
                EstimatorProxy(DecisionTreeClassifier(random_state=13)),
                threshold="1.25 * mean")
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(SelectFromModel(
                EstimatorProxy(RandomForestClassifier(random_state=13,
                                                      n_estimators=3)),
                threshold="median"))
        ]),
        ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(), SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()),
        ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()),
        ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:  # idiomatic truthiness test instead of "== True"
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_audit(classifier, name, with_proba=True, **kwargs):
    """Fit a feature-union (continuous + polynomial categorical) PMML
    pipeline on the audit data, apply classifier customizations, and store
    the pickle plus predictions.

    kwargs are forwarded to customize() after fitting.
    """
    continuous_mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Income", ContinuousDomain()),
        ("Hours", ContinuousDomain())
    ])
    categorical_mapper = DataFrameMapper([
        ("Employment", [CategoricalDomain(), LabelBinarizer(),
                        SelectorProxy(SelectFromModel(EstimatorProxy(
                            DecisionTreeClassifier(random_state=13))))]),
        ("Education", [CategoricalDomain(), LabelBinarizer(),
                       SelectorProxy(SelectFromModel(
                           EstimatorProxy(RandomForestClassifier(
                               random_state=13, n_estimators=3)),
                           threshold="1.25 * mean"))]),
        ("Marital", [CategoricalDomain(),
                     LabelBinarizer(neg_label=-1, pos_label=1),
                     SelectKBest(k=3)]),
        ("Occupation", [CategoricalDomain(), LabelBinarizer(),
                        SelectorProxy(SelectKBest(k=3))]),
        ("Gender", [CategoricalDomain(),
                    LabelBinarizer(neg_label=-3, pos_label=3)]),
        ("Deductions", [CategoricalDomain(), LabelEncoder()]),
    ])
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            # Categorical dummies are expanded with interaction terms.
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:  # idiomatic truthiness test instead of "== True"
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                                   columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_iris(classifier, name, with_proba=True):
    """Fit an iris PMML pipeline that unions raw and log10-scaled features,
    scales robustly, reduces with PCA, and classifies; store the artifacts."""
    pipeline = PMMLPipeline([
        ("union", FeatureUnion([
            ("normal_scale", DataFrameMapper([
                (iris_X.columns.values, ContinuousDomain()),
            ])),
            # Same columns again, but on a log10 scale.
            ("log_scale", DataFrameMapper([
                (iris_X.columns.values, FunctionTransformer(numpy.log10))
            ]))
        ])),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:  # idiomatic truthiness test instead of "== True"
        species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=[
            "probability(setosa)", "probability(versicolor)",
            "probability(virginica)"
        ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_housing(regressor, name, with_kneighbors=False):
    """Fit a housing PMML pipeline (interaction features, scaling, percentile
    selection, regressor) and store the pickle plus MEDV predictions.

    with_kneighbors -- when truthy, also store the 1-based indices of each
    sample's nearest neighbors (regressor must implement kneighbors()).
    """
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("transformer-pipeline", Pipeline([
            ("polynomial", PolynomialFeatures(degree=2, interaction_only=True,
                                              include_bias=False)),
            ("scaler", StandardScaler()),
            ("selector", SelectorProxy(
                SelectPercentile(score_func=f_regression, percentile=35))),
        ])),
        ("regressor", regressor)
    ])
    pipeline.fit(housing_X, housing_y)
    store_pkl(pipeline, name + ".pkl")
    medv = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    if with_kneighbors:  # idiomatic truthiness test instead of "== True"
        Xt = pipeline_transform(pipeline, housing_X)
        kneighbors = regressor.kneighbors(Xt)
        # kneighbors[1] holds 0-based row indices; shift to 1-based ids.
        medv_ids = DataFrame(kneighbors[1] + 1, columns=[
            "neighbor(" + str(x + 1) + ")"
            for x in range(regressor.n_neighbors)
        ])
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")
def build_audit_dict(classifier, name, with_proba=True):
    """Fit a DictVectorizer + classifier PMML pipeline on the dict-encoded
    audit data and store the pickle plus predictions."""
    pipeline = PMMLPipeline([("dict-transformer", DictVectorizer()),
                             ("classifier", classifier)])
    pipeline.fit(audit_dict_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_dict_X), columns=["Adjusted"])
    if with_proba:  # idiomatic truthiness test instead of "== True"
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_dict_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_auto_na(regressor, name):
    """Fit a PMML pipeline on the auto-mpg data that contains missing
    values, and store the pickled pipeline plus its mpg predictions."""
    # Numeric columns: declare the domain (None marks a missing value),
    # then impute before modelling.
    numeric_features = [
        ([column], [ContinuousDomain(missing_values=None), Imputer()])
        for column in ["acceleration", "displacement", "horsepower", "weight"]
    ]
    # Categorical columns: -1 encodes a missing category; impute it away,
    # then one-hot encode via the PMML-aware binarizer.
    categorical_features = [
        ([column], [CategoricalDomain(missing_values=-1),
                    CategoricalImputer(missing_values=-1),
                    PMMLLabelBinarizer()])
        for column in ["cylinders", "model_year", "origin"]
    ]
    mapper = DataFrameMapper(numeric_features + categorical_features)
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_na_X, auto_na_y)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
def build_iforest_housing_anomaly(iforest, name):
    """Fit an isolation-forest anomaly detector on the housing data and
    store the decision function alongside a textual outlier flag."""
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", mapper), ("estimator", iforest)])
    pipeline.fit(housing_X)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    # predict() yields -1 for anomalies; render the boolean as "true"/"false".
    flags = pipeline.predict(housing_X) == -1
    outlier = DataFrame(flags, columns=["outlier"])
    outlier = outlier.replace(True, "true").replace(False, "false")
    combined = pandas.concat([decisionFunction, outlier], axis=1)
    store_csv(combined, name + ".csv")
def build_auto(regressor, name):
    """Fit a mapper + regressor PMML pipeline on the auto-mpg data and
    store the pickled pipeline plus its mpg predictions."""
    mapper = DataFrameMapper([
        (["cylinders"], CategoricalDomain()),
        # Impute, then standardize, the continuous engine/vehicle measures.
        (["displacement", "horsepower", "weight", "acceleration"],
         [ContinuousDomain(), Imputer(missing_values="NaN"), StandardScaler()]),
        # Binarize model_year at 77 (pre/post 1973 oil-crisis effects).
        (["model_year"], [CategoricalDomain(), Binarizer(threshold=77)],
         {"alias": "bin(model_year, 77)"}),
        (["origin"], OneHotEncoder())
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
    pipeline.fit(auto_X, auto_y)
    store_pkl(pipeline, name + ".pkl")
    predictions = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(predictions, name + ".csv")
class XgbModel(object):
    """Train, evaluate, and persist an XGBoost binary classifier wrapped in
    a PMMLPipeline."""

    def __init__(self, train, train_label, test, test_label):
        self.__train = train
        self.__train_label = train_label
        self.__test = test
        self.__test_label = test_label
        self.__bst = None
        self.__feat_imp = None
        self.__test_preds = None
        self.__test_predictions = None
        self.__output = None

    def train(self):
        """Fit the classifier; self.__bst ends up as a fitted PMMLPipeline."""
        self.__bst = XGBClassifier(objective="binary:logistic")
        self.__bst = PMMLPipeline([("estimator", self.__bst)])
        self.__bst.fit(self.__train, self.__train_label,
                       estimator__eval_metric="auc")

    def predict(self):
        # Column 1 of predict_proba is the positive-class probability.
        self.__test_preds = self.__bst.predict_proba(self.__test)[:, 1]
        self.__test_predictions = self.__bst.predict(self.__test)

    def feature_importances(self):
        """Plot the estimator's feature importances as a bar chart."""
        # BUG FIX: after train(), self.__bst is a PMMLPipeline, which does not
        # expose feature_importances_; read it from the wrapped estimator step.
        estimator = self.__bst.named_steps["estimator"]
        self.__feat_imp = pd.Series(
            estimator.feature_importances_,
            ["gbc", "rf", "ab", "lr"]).sort_values(ascending=False)
        self.__feat_imp.plot(kind="bar", title="Feature Importances")
        plt.ylabel("Feature Importance Score")
        plt.show()

    def evaluate(self):
        print("auc : %.4f" %
              roc_auc_score(self.__test_label, self.__test_preds))
        print("accuracy score : %.4f" %
              accuracy_score(self.__test_label, self.__test_predictions))

    def evaluate_output(self):
        """Dump features, true labels and predicted probabilities to CSV."""
        self.__output = np.hstack(
            (self.__test, self.__test_label.reshape((-1, 1)),
             self.__test_preds.reshape((-1, 1))))
        pd.DataFrame(
            self.__output).to_csv("C:\\Users\\Dell\\Desktop\\output.csv")

    def xgbmodel_output(self):
        joblib.dump(self.__bst,
                    "C:\\Users\\Dell\\Desktop\\bstML.pkl.z",
                    compress=True)
def build_versicolor(classifier, name, with_proba=True):
    """Fit a versicolor PMML pipeline (robust scaling, cubic polynomial
    features, k-best selection, classifier) and store the artifacts."""
    # NOTE: redundant double parentheses around the mapper tuple removed.
    mapper = DataFrameMapper([
        (versicolor_columns[:-1], [ContinuousDomain(), RobustScaler()])
    ])
    pipeline = PMMLPipeline([("mapper", mapper),
                             ("transformer", PolynomialFeatures(degree=3)),
                             ("selector", SelectKBest(k="all")),
                             ("classifier", classifier)])
    pipeline.fit(versicolor_X, versicolor_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba:  # idiomatic truthiness test instead of "== True"
        species_proba = DataFrame(pipeline.predict_proba(versicolor_X),
                                  columns=["probability_0", "probability_1"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_svm_housing_anomaly(svm, name):
    """Fit a one-class SVM anomaly detector on the housing data and store
    the decision function alongside a textual outlier flag."""
    mapper = DataFrameMapper([(housing_columns[:-1], ContinuousDomain())])
    # Scale to [-1, 1] before the SVM inside a nested pipeline step.
    estimator = Pipeline([("first", MaxAbsScaler()), ("second", svm)])
    pipeline = PMMLPipeline([("mapper", mapper), ("estimator", estimator)])
    pipeline.fit(housing_X)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    # Non-positive predictions mark outliers; render as "true"/"false" text.
    flags = pipeline.predict(housing_X) <= 0
    outlier = DataFrame(flags, columns=["outlier"])
    outlier = outlier.replace(True, "true").replace(False, "false")
    combined = pandas.concat([decisionFunction, outlier], axis=1)
    store_csv(combined, name + ".csv")
def build_audit_na(classifier, name, with_proba=True):
    """Fit a PMML pipeline on the audit data containing missing values
    (impute numerics and categoricals) and store the artifacts."""
    mapper = DataFrameMapper(
        # Numeric columns: None marks missing; impute before modelling.
        [([column], [ContinuousDomain(missing_values=None), Imputer()])
         for column in ["Age", "Income", "Hours"]] +
        # Categorical columns: impute the mode, then one-hot encode.
        [([column], [CategoricalDomain(missing_values=None),
                     CategoricalImputer(), PMMLLabelBinarizer()])
         for column in ["Employment", "Education", "Marital",
                        "Occupation", "Gender"]]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba:  # idiomatic truthiness test instead of "== True"
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X),
                                   columns=["probability(0)",
                                            "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
def build_wheat(kmeans, name, with_affinity=True):
    """Fit a k-means clustering PMML pipeline on the wheat data and store
    the pickle plus cluster assignments (optionally with affinities)."""
    mapper = DataFrameMapper([(wheat_X.columns.values, ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", mapper), ("scaler", MinMaxScaler()),
                             ("clusterer", kmeans)])
    pipeline.fit(wheat_X)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity:  # idiomatic truthiness test instead of "== True"
        # Distances from each sample to the three cluster centers.
        Xt = pipeline_transform(pipeline, wheat_X)
        affinity_0 = kmeans_distance(kmeans, 0, Xt)
        affinity_1 = kmeans_distance(kmeans, 1, Xt)
        affinity_2 = kmeans_distance(kmeans, 2, Xt)
        cluster_affinity = DataFrame(
            numpy.transpose([affinity_0, affinity_1, affinity_2]),
            columns=["affinity(0)", "affinity(1)", "affinity(2)"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
def build_iris(classifier, name, with_proba=True):
    """Fit an iris PMML pipeline (robust scaling, incremental PCA,
    classifier) and store the pickle plus species predictions."""
    mapper = DataFrameMapper([
        (iris_X.columns.values, ContinuousDomain()),
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("scaler", RobustScaler()),
                             ("pca", IncrementalPCA(n_components=3,
                                                    whiten=True)),
                             ("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:  # idiomatic truthiness test instead of "== True"
        species_proba = DataFrame(pipeline.predict_proba(iris_X), columns=[
            "probability_setosa", "probability_versicolor",
            "probability_virginica"
        ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
def build_wheat(kmeans, name, with_affinity=True):
    """Fit a log10-transformed, min-max-scaled k-means PMML pipeline on the
    wheat data and store cluster assignments (optionally with affinities)."""
    mapper = DataFrameMapper([([
        "Area", "Perimeter", "Compactness", "Kernel.Length", "Kernel.Width",
        "Asymmetry", "Groove.Length"
    ], ContinuousDomain())])
    pipeline = PMMLPipeline([("mapper", mapper),
                             ("transformer",
                              FunctionTransformer(numpy.log10)),
                             ("scaler", MinMaxScaler()),
                             ("clusterer", kmeans)])
    pipeline.fit(wheat_X)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity:  # idiomatic truthiness test instead of "== True"
        # Distances from each sample to the three cluster centers.
        Xt = pipeline_transform(pipeline, wheat_X)
        affinity_0 = kmeans_distance(kmeans, 0, Xt)
        affinity_1 = kmeans_distance(kmeans, 1, Xt)
        affinity_2 = kmeans_distance(kmeans, 2, Xt)
        cluster_affinity = DataFrame(
            numpy.transpose([affinity_0, affinity_1, affinity_2]),
            columns=["affinity_0", "affinity_1", "affinity_2"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
def build_audit_na(classifier, name, with_proba=True):
    """Fit a PMML pipeline on the audit data with missing values, collapsing
    Employment into Private/Public/Other and Gender into 0/1 via lookup
    tables, and store the artifacts."""
    # Collapse detailed employment categories into a coarse Private/Public
    # scheme; anything unmapped falls back to "Other".
    employment_mapping = {
        "Consultant": "Private",
        "PSFederal": "Public",
        "PSLocal": "Public",
        "PSState": "Public",
        "SelfEmp": "Private",
        "Private": "Private"
    }
    gender_mapping = {"Female": 0, "Male": 1}
    mapper = DataFrameMapper(
        [([column], [ContinuousDomain(missing_values=None), Imputer()])
         for column in ["Age", "Income", "Hours"]] +
        [("Employment", [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            LookupTransformer(employment_mapping, "Other"),
            PMMLLabelBinarizer()
        ])] +
        [([column], [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            PMMLLabelBinarizer()
        ]) for column in ["Education", "Marital", "Occupation"]] +
        [("Gender", [
            CategoricalDomain(missing_values=None),
            CategoricalImputer(),
            LookupTransformer(gender_mapping, None)
        ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba:  # idiomatic truthiness test instead of "== True"
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
# XGBoost hyper-parameters for a three-class soft-probability model.
parameters = {
    'eta': 0.3,
    'silent': True,  # option for logging
    'objective': 'multi:softprob',  # error evaluation for multiclass tasks
    'num_class': 3,  # number of classes to predict
    'max_depth': 3  # depth of the trees in the boosting process
}
# NOTE(review): num_round is never used below -- the sklearn-style
# XGBClassifier API controls boosting rounds via n_estimators instead.
num_round = 20  # the number of training iterations
model = xgb.XGBClassifier(**parameters)
# model.fit(X_train, y_train)
# preds = model.predict(X_test)
# Pass every feature column through unchanged; the mapper records the schema
# so that the exported PMML knows the input field names.
default_mapper = DataFrameMapper([(i, None) for i in feat_names])
pipeline = PMMLPipeline([('mapper', default_mapper), ("classifier", model)])
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
# NOTE(review): y_test_trans is computed but never used -- the scores below
# are taken against y_test directly; confirm which one was intended.
y_test_trans = np.array([_[0] for _ in y_test.values])
print(precision_score(y_test, preds, average='macro'))  # per-class precision, then averaged
print(precision_score(y_test, preds, average='micro'))  # global precision, ignoring class
sklearn2pmml(pipeline, "iris_v2.pmml", with_repr=True)
# sklearn2pmml(estimator=model, mapper=default_mapper, pmml='iris_v2.xml')
class WAF(object):
    """Review-sentiment classifier: reads a labelled corpus, builds
    character-bigram TF-IDF features, trains an SVM inside a PMMLPipeline,
    and persists the fitted pipeline with joblib."""

    def __init__(self):
        print('读取语料库:')
        # File format per line: <title>\t<好评|差评 label>\t<review text>
        seed_list, content_list = self.get_data(
            './data/豆瓣')
        print('\t' + '好评数:' + str(len(seed_list)) + ' 差评数:' +
              str(len(content_list)))
        # NOTE(review): get_data returns (positive, negative); positives are
        # labelled 0 and negatives 1 here -- confirm the inversion is intended.
        seed_y = [0 for i in range(0, len(seed_list))]
        content_y = [1 for i in range(0, len(content_list))]
        queries = content_list + seed_list
        y = content_y + seed_y
        # Vectorize the data (TF-IDF over the bigrams emitted by get_ngrams).
        self.vectorizer = TfidfVectorizer(tokenizer=self.get_ngrams)
        X = self.vectorizer.fit_transform(queries)
        print('向量化后维度:' + str(X.shape))
        print('划分训练集、测试集...')
        # Split the X matrix / y list with train_test_split:
        # rows of X_train pair one-to-one with y_train -> used to fit the model
        # rows of X_test pair one-to-one with y_test  -> used to measure accuracy
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=46)
        print('划分完成,训练集开始训练分类器...')
        #self.model = LogisticRegression()
        self.model = svm.SVC()
        #self.model=MultinomialNB(alpha=0.001)
        self.pipeline = PMMLPipeline([("classifier", self.model)])
        self.pipeline.fit(X_train, y_train)
        joblib.dump(self.pipeline, "./result/classifier.pkl.z",
                    compress=9)  # compress: joblib compression level
        print('训练完毕!!! 
测试集开始预测结果...')
        predict = self.pipeline.predict(X_test)
        print("精度:{0:f}".format(
            metrics.precision_score(y_test, predict, average="weighted")))
        print("召回:{0:f}".format(
            metrics.recall_score(y_test, predict, average="weighted")))
        print("f1-score:{0:f}".format(
            metrics.f1_score(y_test, predict, average="weighted")))
        print("预测完毕!!!!")
        print('***********************************************************')
        print('***********************************************************')

    def predict(self):
        """Score a second corpus with the trained model and print metrics."""
        new_pos, new_neg = self.get_data('./data/影评')  # same format as the training data
        new = new_pos + new_neg
        pos_y = [0 for i in range(0, len(new_pos))]
        neg_y = [1 for j in range(0, len(new_neg))]
        new_y = pos_y + neg_y
        # Reuse the fitted vectorizer so the feature space matches training.
        X_predict = self.vectorizer.transform(new)
        print('新数据向量化后维度:' + str(X_predict.shape))
        res = self.model.predict(X_predict)
        print("精度:{0:f}".format(
            metrics.precision_score(new_y, res, average="weighted")))
        print("召回:{0:f}".format(
            metrics.recall_score(new_y, res, average="weighted")))
        print("f1-score:{0:f}".format(
            metrics.f1_score(new_y, res, average="weighted")))
        print("预测完毕!!!!")

    def get_data(self, path):
        """Read tab-separated reviews from path; return (positive, negative)
        review-text lists, keyed on the 好评/差评 label in column 2."""
        f = open(path, 'r', encoding='utf8')
        pos = []
        neg = []
        for line in f.readlines():
            text = line.strip().split('\t')
            if len(text) == 3 and text[1] == '好评':
                pos.append(text[2])
            if len(text) == 3 and text[1] == '差评':
                neg.append(text[2])
        return pos, neg

    def get_ngrams(self, query):
        """Tokenize a query into overlapping character bigrams."""
        tempQuery = str(query)
        ngrams = []
        for i in range(0, len(tempQuery) - 2):
            ngrams.append(tempQuery[i:i + 2])
        return ngrams
#vvv=mapper.fit_transform(heart_data[heart_data.columns.difference(["chd"])]) #print vvv ''' union = FeatureUnion([ ("first", mapper), ("second", mapper2) ]) ''' #tt= mapper2.fit_transform(heart_data[heart_data.columns.difference(["chd"])]) #print tt['famhist'] #用pipeline定义使用的模型,特征工程等 pipeline = PMMLPipeline([('mapper', mapper), ("classifier", GradientBoostingClassifier()) #("classifier",LogisticRegression()) ]) #vv=pipeline.fit(heart_data[heart_data.columns.difference(["chd"])]) vv = pipeline.fit(heart_data[heart_data.columns.difference(["chd"])], heart_data["chd"]) print vv #print pipeline.transform(heart_data[heart_data.columns.difference(["chd"])]) print pipeline.predict(heart_data[heart_data.columns.difference(["chd"])]) sklearn2pmml(pipeline, "lrHeart.pmml", with_repr=True, debug=True)
def PMML_creation(train_path, test_path, pmml_predictions, pmml_path, path,
                  acct_id):
    """Rebuild the persisted model with its original hyper-parameters, fit a
    PMML pipeline (and a parallel pickle pipeline) on the training data,
    score the test data, and export PMML, pickle, and prediction CSVs.

    train_path/test_path -- CSV paths for training and scoring data
    pmml_predictions     -- output CSV path for the scored test data
    pmml_path            -- basename for the emitted .pmml / .pkl artifacts
    path, acct_id        -- locate the previously trained model pickle
    """
    logging.info('PMML creation Started.')
    data = pd.read_csv(r'' + train_path)
    data2 = pd.read_csv(r'' + test_path)
    features = ['avg_delay_categorical', 'variance_categorical',
                'LMH_cumulative', 'avg_of_invoices_closed',
                'avg_of_all_delays', 'payment_count_quarter_q1',
                'payment_count_quarter_q2', 'payment_count_quarter_q3',
                'payment_count_quarter_q4', 'invoice_count_quarter_q1',
                'invoice_count_quarter_q2', 'invoice_count_quarter_q3',
                'invoice_count_quarter_q4', 'number_invoices_closed']
    # Clone the previously trained model: same class and hyper-parameters,
    # but unfitted, so the pipelines below fit it from scratch.
    model = joblib.load(path + '/account_' + acct_id +
                        '/trained_model/model.pkl')
    params = model.get_params()
    classifier = type(model)()
    rf = classifier.set_params(**params)
    print("-" * 100)
    print(rf)
    print((rf.get_params()))
    print("-" * 100)
    print(model.get_params())
    # Pass each feature column through unchanged; the mapper only records
    # the input schema for the PMML export.
    mapper = DataFrameMapper([(feature, None) for feature in features])
    labels = data.loc[:, 'output']
    labels.name = 'output'
    data = data[features].astype('double')
    print(data.dtypes)
    pipeline = PMMLPipeline([("mapper", mapper), ("estimator", rf)])
    pickle_pipeline = Pipeline([("mapper", mapper), ("model", rf)])
    pipeline.fit(data, labels)
    pickle_pipeline.fit(data, labels)
    predictions = pipeline.predict(data2[features])
    predictions_prob = pipeline.predict_proba(data2[features])
    data2['PMML_predictions'] = predictions
    for i in range(0, data2.shape[0]):
        data2.at[i, 'PMML_pred_proba_0'] = predictions_prob[i][0]
        data2.at[i, 'PMML_pred_proba_1'] = predictions_prob[i][1]
    data2.to_csv(pmml_predictions, index=False)
    sklearn2pmml(pipeline, r"" + pmml_path + '_PIPELINED' + ".pmml",
                 debug=True)
    joblib.dump(pickle_pipeline, r"" + pmml_path + "_PIPELINED.pkl")
    # BUG FIX: log the size of the file that was actually written; the
    # original measured pmml_path + ".pmml", which is never created here.
    logging.info('PMML created of size ' +
                 str(file_size(r"" + pmml_path + '_PIPELINED' + ".pmml")))
pipeline.fit(train,train['labels']) #test = pd.read_csv(r'Data/UDM_DISPUTE_20171231-20180202.csv') test = pd.read_csv('validation.csv') test['main_output']=(test['FIN_PAID_AMT']>(0.01*test['FIN_ORIGINAL_AMT'])) test['labels']=test['main_output'].map({True:-1,False:1}) test = test.rename(columns={'ZZ_CLAIMDATE_SIMP_DT': 'customer_claim_date', 'CREATE_TIME': 'deduction_created_date','ZZ_XREF3': 'product_category','KUNWE': 'ship_to','FIN_ORIGINAL_AMT': 'original_dispute_amount','FIN_KUNNR': 'payer','FIN_PAID_AMT': 'paid_amount'}) from sklearn.linear_model import LogisticRegression test_result = pd.DataFrame() test_result['output'] = pipeline.predict(test.head(1790)) # test_result['predict_proba1'] = pipeline.predict_proba(test.head(1790))[:,0] test_result['predict_proba2'] = pipeline.predict_proba(test.head(1790))[:,1] test_result['actual_result'] = test['labels'].head(1790) from sklearn.metrics import classification_report print(classification_report(test_result['actual_result'],test_result['output'])) #pipeline.predict() # # test_real = pd.read_csv('test_real3.csv',encoding='latin') #
# Loading and reshaping data # Closing prices for Ethereum and Bitcoin are stored in separate CSV files df_bc = pd.read_csv("BTC-USD.csv", parse_dates=['Date']) df_eth = pd.read_csv("ETH-USD.csv", parse_dates=['Date']) bcv = df_bc.Close.values.reshape(-1, 1) etv = df_eth.Close.values.reshape(-1, 1) scaler_for_output = StandardScaler().fit(etv) scaler_for_prediction = StandardScaler() scaler_for_prediction.mean_ = -1.0 * scaler_for_output.mean_ / scaler_for_output.scale_ scaler_for_prediction.scale_ = 1 / scaler_for_output.scale_ pmml_pipe = PMMLPipeline(steps=[('data_transformer', StandardScaler()), ('model', linear_model.LinearRegression())], predict_transformer=scaler_for_prediction) # Training the model pmml_pipe.fit(bcv, scaler_for_output.transform(etv).reshape(-1, )) print('Learned parameters') print(pmml_pipe.steps[1][1].coef_) print(pmml_pipe.steps[1][1].intercept_) # Testing prediction print(pmml_pipe.predict([[1.0]])) print(pmml_pipe.predict_transform([[1.0]])) # Exporting Model to PMML sklearn2pmml(pmml_pipe, 'etherium_price_redict_model.xml', with_repr=True)
#!/usr/bin/env python # coding: utf-8 from sklearn.datasets import load_iris from sklearn.preprocessing import MinMaxScaler from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn2pmml import PMMLPipeline iris = load_iris() X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.25) y_train pipe_l = PMMLPipeline([('minmax', MinMaxScaler()), ('lr', LogisticRegression())]) model = pipe_l.fit(X_train,y_train) pipe_l.score(X_test, y_test) X_test[1] y_test[1] pipe_l.predict([[6.7, 3. , 5.2, 2.3]]) import pandas as pd irisd = pd.DataFrame(iris.data, columns=iris.feature_names) irisd.columns from nyoka import skl_to_pmml skl_to_pmml(pipe_l, irisd.columns, 'Target', "iris_nyoka_pipeline.pmml") from sklearn2pmml import sklearn2pmml sklearn2pmml(pipe_l, "iris_pipeline.pmml", with_repr = True, debug = True)
print(train_y.shape)
print(train_x.shape)
print(
    'XGBClassifier------------------------------------------------------------'
)
# Hyper-parameters follow the ubicomp2018 runner-up configuration.
xg = PMMLPipeline([("classifier",
                    xgb.XGBClassifier(max_depth=10,
                                      min_child_weight=1,
                                      gamma=0.1,
                                      subsample=0.8,
                                      colsample_bytree=0.8,
                                      reg_alpha=0.005))])
xg.fit(train_x, train_y)
Prediction_RT = xg.predict(test_x)
print(classification_report(
    test_y,
    Prediction_RT,
    digits=5,
))
sklearn2pmml(xg, "xg2.pmml")
# BUG FIX: xg is a PMMLPipeline, which exposes neither the booster that
# plot_importance needs nor feature_importances_; use the wrapped
# classifier step instead.
booster = xg.named_steps["classifier"]
plot_importance(booster, max_num_features=32)
print(booster.feature_importances_)
# BUG FIX: save the figure before show() -- show() clears the current
# figure, so saving afterwards wrote an empty image.
plt.savefig('importance', dpi=600)
plt.show()
'KUNWE': 'ship_to',
                            'FIN_ORIGINAL_AMT': 'original_dispute_amount',
                            'FIN_KUNNR': 'payer',
                            'FIN_PAID_AMT': 'paid_amount'
                            })
# test['ship_to'] = test['ship_to'].astype('str').str.split('.').str[0]
#
# test_transformations = pd.DataFrame(mapper.fit_transform(test),columns=['create_minus_claim_date', 'category_history', 'cal_cust_history', 'ZZ_CLAIMDATE_SIMP_DT_month', 'ship_to_history', 'original_with_avg_dispute', 'rank_xref_in_kunnr', 'b_value', 'rank_kunwe_in_kunnr'])
#
# test_transformations.to_csv('test_transformations.csv')
# Collect predictions, positive-class probabilities, and true labels for the
# whole test frame side by side.
test_result = pd.DataFrame()
test_result['output'] = pipeline.predict(test)
# test_result['predict_proba1'] = pipeline.predict_proba(test)[:, 0]
test_result['predict_proba2'] = pipeline.predict_proba(test)[:, 1]
test_result['actual_result'] = test['labels']
from sklearn.metrics import classification_report
print(
    classification_report(test_result['actual_result'],
                          test_result['output']))
from sklearn2pmml import sklearn2pmml
#
#
# The PMML export (with a custom transformer classpath) is left disabled.
#sklearn2pmml(pipeline, "only_b_value.pmml", user_classpath=[r"D:\jesus\sap\sklearn2pmml-plugin-1.0-SNAPSHOT.jar"],debug=True)
#
# 数据拆分 X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( data, label, test_size=0.25, shuffle=True) # nn nn = PMMLPipeline([("classifier", sk_nn.MLPClassifier(activation='tanh', solver='adam', alpha=0.0001, learning_rate='adaptive', learning_rate_init=0.001, max_iter=1000))]) # 模型拟合 nn.fit(data, label) # 模型预测 nn_predict = nn.predict(X_test) # 模型评估 # 基础打分 nn_score = nn.score(X_test, y_test) print(nn_score) # 交叉验证 nn_cross1 = cross_val_score(nn, X_train, y_train, scoring='accuracy', cv=10, n_jobs=-1) nn_cross2 = cross_val_score(nn, X_test, y_test, scoring='accuracy',