def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    """Fit a TF-IDF sentiment pipeline around ``classifier`` and store its outputs.

    The pipeline (TF-IDF vectorizer -> ``SelectKBest(f_classif, k=500)`` ->
    ``classifier``) is fitted on the module-level ``sentiment_X`` and
    ``sentiment_y``, configured with ``pmml_options``, and handed to
    ``store_pkl`` under ``name + ".pkl"``. Its predictions on ``sentiment_X``
    are handed to ``store_csv`` under ``name + ".csv"``.

    Args:
        classifier: Final estimator of the pipeline.
        name: Base name used for both stored artifacts.
        with_proba: When truthy, the stored frame also carries the
            ``predict_proba`` output as columns ``probability(0)`` and
            ``probability(1)``.
        **pmml_options: Forwarded unchanged to ``pipeline.configure``.
    """
    # Random forests are fed float32 features, every other classifier float64.
    if isinstance(classifier, RandomForestClassifier):
        feature_dtype = numpy.float32
    else:
        feature_dtype = numpy.float64

    vectorizer = TfidfVectorizer(
        analyzer="word",
        preprocessor=None,
        strip_accents=None,
        lowercase=True,
        token_pattern=None,
        tokenizer=Splitter(),
        stop_words="english",
        ngram_range=(1, 2),
        norm=None,
        dtype=feature_dtype,
    )
    pipeline = PMMLPipeline([
        ("tf-idf", vectorizer),
        ("selector", SelectKBest(f_classif, k=500)),
        ("classifier", classifier),
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")

    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    # Plain truthiness test instead of comparing against True with "==".
    if with_proba:
        score_proba = DataFrame(
            pipeline.predict_proba(sentiment_X),
            columns=["probability(0)", "probability(1)"],
        )
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
def test_call(self):
    """Check the tokens a default ``Splitter()`` returns for sample inputs."""
    splitter = Splitter()
    # (expected tokens, input text) pairs, asserted in this order.
    cases = (
        ((), ""),
        ((), "."),
        (("one", ), "one"),
        (("++one", ), "++one"),
        (("one++", ), "one++"),
        (("one", ), "--one"),
        (("one", ), "one--"),
        (("one", "two", "three"), "one two three"),
        (("one", "t,w.o", "three"), ",one _t,w.o_ three."),
    )
    for expected, text in cases:
        self.assertEqual(expected, splitter(text))
("densifier", DenseTransformer()), ("selector", SelectKBest(f_classif, k = 500)), ("classifier", classifier) ]) pipeline.fit(sentiment_X, sentiment_y) pipeline.configure(**pmml_options) store_pmml(pipeline, name) score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"]) if with_proba: score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"]) score = pandas.concat((score, score_proba), axis = 1) store_csv(score, name) if "Sentiment" in datasets: pmml_textindex_args = dict(analyzer = "word", preprocessor = None, strip_accents = None, dtype = numpy.float64) build_sentiment(LinearDiscriminantAnalysis(), TfidfVectorizer(tokenizer = Splitter(), ngram_range = (1, 3), norm = None, **pmml_textindex_args), "LinearDiscriminantAnalysisSentiment") build_sentiment(LinearSVC(random_state = 13), CountVectorizer(tokenizer = Splitter(), ngram_range = (1, 2), **pmml_textindex_args), "LinearSVCSentiment", with_proba = False) build_sentiment(LogisticRegression(multi_class = "ovr"), TfidfVectorizer(stop_words = "english", tokenizer = Matcher(), ngram_range = (1, 3), binary = True, norm = None, **pmml_textindex_args), "LogisticRegressionSentiment") build_sentiment(RandomForestClassifier(max_depth = 8, min_samples_leaf = 10, n_estimators = 31, random_state = 13), CountVectorizer(ngram_range = (1, 2), **pmml_textindex_args), "RandomForestSentiment") build_sentiment(XGBClassifier(objective = "binary:logistic", ntree_limit = 31, random_state = 13), CountVectorizer(tokenizer = Matcher(), **pmml_textindex_args), "XGBoostSentiment") # # Multi-class classification # def load_iris(name): df = load_csv(name) return split_csv(df) iris_X, iris_y = load_iris("Iris")
n = typicalNDict[name] return group.sample(n=n) major = pd.read_csv(r"C:\Users\钟顺民\Desktop\6.csv", sep=",", encoding='ISO-8859-1') \ .dropna().groupby('id', as_index=False, group_keys=False) \ .apply(typicalsamling, typicalNDict_Major) # 分配数据 X = major.drop(['id'], axis=1) Y = major["id"] print("data done!") pipeline = PMMLPipeline([ ('mapper', DataFrameMapper([ ('name', TfidfVectorizer(norm=None, analyzer="word", max_features=500, tokenizer=Splitter())), ('description', TfidfVectorizer(norm=None, analyzer="word", max_features=1000, tokenizer=Splitter())) ])), ('model', SVC(max_iter=10000)), # train on TF-IDF vectors w/ Linear SVM classifier ]) print("model set done!") pipeline.fit(X, Y) print("model fit done!") c = pd.read_csv(r"C:\Users\钟顺民\Desktop\6.csv", sep=',', encoding='ISO-8859-1').dropna().sample(n=200) prediction = pipeline.predict(c.drop(['id'], axis=1)) t = c['id'] print("Accuracy Score ->", accuracy_score(prediction, t) * 100) """ Accuracy Score -> 98.5 """
("count", WordCountTransformer()) ])), ("selector", SelectKBest(f_classif, k = 1000)), ("classifier", classifier) ]) pipeline.fit(sentiment_X, sentiment_y) pipeline.configure(**pmml_options) store_pkl(pipeline, name) score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"]) if with_proba == True: score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"]) score = pandas.concat((score, score_proba), axis = 1) store_csv(score, name) if "Sentiment" in datasets: build_sentiment(LinearSVC(random_state = 13), Splitter(), "LinearSVCSentiment", with_proba = False) build_sentiment(LogisticRegressionCV(cv = 3), None, "LogisticRegressionSentiment") build_sentiment(RandomForestClassifier(n_estimators = 10, min_samples_leaf = 3, random_state = 13), Matcher(), "RandomForestSentiment", compact = False) # # Regression # auto_X, auto_y = load_auto("Auto") auto_X["cylinders"] = auto_X["cylinders"].astype(int) auto_X["model_year"] = auto_X["model_year"].astype(int) auto_X["origin"] = auto_X["origin"].astype(int) def build_auto(regressor, name, fit_params = {}, predict_params = {}, **pmml_options): cylinders_origin_mapping = {
data = pd.read_csv(r"../data/major_sample.csv", encoding='ISO-8859-1') return data if __name__ == '__main__': samples = get_major_data() X = samples.drop(['id'], axis=1) Y = samples["id"] pipeline = PMMLPipeline([ ('mapper', DataFrameMapper([('name', TfidfVectorizer(norm=None, analyzer="word", max_features=1000, tokenizer=Splitter())), ('description', TfidfVectorizer(norm=None, analyzer="word", max_features=1000, tokenizer=Splitter()))])), ('model', SVC(max_iter=10000) ), # train on TF-IDF vectors w/ Linear SVM classifier ]) print("model set done!") pipeline.fit(X, Y) print("model fit done!") c = pd.read_csv(r"../data/klarna 2.csv", encoding='ISO-8859-1').sample(n=200)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn.cluster import KMeans
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

# Build the pipeline.
# Note: the last step of a PMMLPipeline must be an estimator; TfidfVectorizer
# must not use norm, and the tokenizer has to be Splitter().
vectorizer = TfidfVectorizer(
    max_df=0.7,
    min_df=0.01,
    tokenizer=Splitter(),
    norm=None,
)
clusterer = KMeans(n_clusters=100, random_state=1000)
pipeline = PMMLPipeline([
    ("td_vector", vectorizer),
    ("km", clusterer),
])

# Train the model; `sentences` holds whitespace-tokenized sentences or files.
pipeline.fit(sentences)

# Save the pipeline model.
sklearn2pmml(pipeline, "hzd.pmml")

# Prediction results.
predictions = pipeline.predict(sentences)
print(predictions)
def test_pickle(self):
    """Check that ``separator_re`` is preserved by ``SplitterTest._clone``."""
    # Raw string: a plain "\W" literal is an invalid escape sequence, which
    # newer Python versions warn about. The string value itself is unchanged.
    separator_re = r"\W"
    splitter = Splitter(separator_re)
    self.assertEqual(separator_re, splitter.separator_re)
    splitter_clone = SplitterTest._clone(splitter)
    self.assertEqual(separator_re, splitter_clone.separator_re)
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

# Controls whether "r" is replaced by "4" in the "drv" target and which
# suffix the exported PMML file name gets.
binary = False

data = pd.read_csv("test/support/mpg.csv")
if binary:
    data["drv"] = data["drv"].replace("r", "4")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

# One mapper entry covering all numeric columns, then one entry per
# categorical column and one per text column.
numeric_steps = [(numeric_features, [ContinuousDomain()])]
categorical_steps = [
    ([column], [CategoricalDomain(), PMMLLabelEncoder()])
    for column in categorical_features
]
text_steps = [
    (column, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter())])
    for column in text_features
]
mapper = DataFrameMapper(numeric_steps + categorical_steps + text_steps)

pipeline = PMMLPipeline([
    ("mapper", mapper),
    ("model", LGBMClassifier(n_estimators=1000)),
])
# NOTE(review): index 3 presumably points at the encoded "class" column that
# follows the three numeric columns in the mapper output - confirm.
pipeline.fit(data, data["drv"], model__categorical_feature=[3])

if binary:
    suffix = "binary"
else:
    suffix = "multiclass"
sklearn2pmml(pipeline, "test/support/python/lightgbm_" + suffix + ".pmml")

# Print the predictions for the first ten rows.
predictions = pipeline.predict(data[:10])
print(predictions)
from common import *

from sklearn2pmml.feature_extraction.text import Matcher, Splitter

sentiment_X, sentiment_y = load_sentiment("Sentiment")

# Tokens dropped from every tokenized line.
stop_words = [
    "a",
    "and",
    "are",
    "d",
    "i",
    "is",
    "it",
    "ll",
    "m",
    "s",
    "the",
    "ve",
    "we",
    "you",
]


def tokenize(tokenizer, name):
    """Tokenize every entry of ``sentiment_X`` and store the result under ``name``.

    Each entry is lower-cased, split with ``tokenizer``, stripped of the
    tokens listed in ``stop_words`` and re-joined with tab characters; the
    transformed data is then passed to ``store_csv``.
    """
    def process(line):
        kept = [
            token
            for token in tokenizer(line.lower())
            if token not in stop_words
        ]
        return "\t".join(kept)

    store_csv(sentiment_X.apply(process), name)


tokenize(Matcher("(?u)\\b\\w\\w+\\b"), "CountVectorizerSentiment")
tokenize(Matcher("\\w+"), "MatcherSentiment")
tokenize(Splitter("\\s+"), "SplitterSentiment")
from sklearn.feature_extraction.text import CountVectorizer
from lightgbm import LGBMRegressor
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

# One mapper entry covering all numeric columns, then one entry per
# categorical column and one per text column.
numeric_steps = [(numeric_features, [ContinuousDomain()])]
categorical_steps = [
    ([column], [CategoricalDomain(), PMMLLabelEncoder()])
    for column in categorical_features
]
text_steps = [
    (
        column,
        [
            CategoricalDomain(),
            CountVectorizer(tokenizer=Splitter(), max_features=5),
        ],
    )
    for column in text_features
]
mapper = DataFrameMapper(numeric_steps + categorical_steps + text_steps)

pipeline = PMMLPipeline([
    ("mapper", mapper),
    ("model", LGBMRegressor(n_estimators=1000)),
])
# NOTE(review): indices 3 and 4 presumably point at the encoded "drv" and
# "class" columns that follow the three numeric columns - confirm.
pipeline.fit(data, data["hwy"], model__categorical_feature=[3, 4])

sklearn2pmml(pipeline, "test/support/python/lightgbm_regression.pmml")

# Print the predictions for the first ten rows.
predictions = pipeline.predict(data[:10])
print(predictions)
("densifier", DenseTransformer()), ("selector", SelectKBest(f_classif, k = 500)), ("classifier", classifier) ]) pipeline.fit(sentiment_X, sentiment_y) pipeline.configure(**pmml_options) store_pmml(pipeline, name) score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"]) if with_proba: score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"]) score = pandas.concat((score, score_proba), axis = 1) store_csv(score, name) if "Sentiment" in datasets: pmml_textindex_args = dict(analyzer = "word", preprocessor = None, strip_accents = None, dtype = numpy.float64) build_sentiment(LinearSVC(random_state = 13), CountVectorizer(tokenizer = Splitter(), ngram_range = (1, 2), **pmml_textindex_args), "LinearSVCSentiment", with_proba = False) build_sentiment(LogisticRegression(multi_class = "ovr"), TfidfVectorizer(stop_words = "english", tokenizer = Matcher(), ngram_range = (1, 3), norm = None, **pmml_textindex_args), "LogisticRegressionSentiment") build_sentiment(RandomForestClassifier(max_depth = 8, min_samples_leaf = 10, n_estimators = 31, random_state = 13), CountVectorizer(ngram_range = (1, 2), **pmml_textindex_args), "RandomForestSentiment") build_sentiment(XGBClassifier(objective = "binary:logistic", ntree_limit = 31, random_state = 13), CountVectorizer(tokenizer = Matcher(), **pmml_textindex_args), "XGBoostSentiment") # # Multi-class classification # def load_iris(name): df = load_csv(name) return split_csv(df) iris_X, iris_y = load_iris("Iris") def build_iris(classifier, name, **pmml_options):
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

# One mapper entry covering all numeric columns, then one entry per
# categorical column and one per text column.
numeric_steps = [(numeric_features, [ContinuousDomain()])]
categorical_steps = [
    ([column], [CategoricalDomain(), OneHotEncoder()])
    for column in categorical_features
]
text_steps = [
    (
        column,
        [
            CategoricalDomain(),
            CountVectorizer(tokenizer=Splitter(), max_features=5),
        ],
    )
    for column in text_features
]
mapper = DataFrameMapper(numeric_steps + categorical_steps + text_steps)

pipeline = PMMLPipeline([
    ("mapper", mapper),
    ("model", LinearRegression()),
])
pipeline.fit(data, data["hwy"])

sklearn2pmml(pipeline, "test/support/python/linear_regression_text.pmml")

# Print the predictions for the first ten rows as a plain list.
predictions = pipeline.predict(data[:10])
print(list(predictions))