from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

from piz_ml.skl import helper
# read_dataset and to_sample are project-local helpers (a possible to_sample
# is sketched below).


def predict(name, target):
    # Turn the raw target text into one feature sample.
    stopwords = helper.read_stopwords()
    features = to_sample(target, stopwords)
    # Restore the persisted classifier and predict the encoded label.
    classifier = helper.load(name + "_model.sav")
    result = classifier.predict([features])
    # Map the numeric prediction back to the original label string.
    le: LabelEncoder = helper.load(name + "_label.sav")
    result = le.inverse_transform(result)
    print(result)
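# predict() assumes to_sample() produces the same whitespace-joined token
# string that train() feeds to CountVectorizer. A minimal sketch of what it
# might look like; the jieba tokenizer is an assumption, not confirmed by the
# original code:
import jieba

def to_sample(target, stopwords):
    # Tokenize, drop stopwords and whitespace, rejoin into one feature string.
    tokens = [t for t in jieba.cut(target) if t.strip() and t not in stopwords]
    return " ".join(tokens)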
def train(name):
    stopwords = helper.read_stopwords()
    (feature_list, label_list) = read_dataset(name, stopwords)
    # Encode string labels as integers and persist the encoder so predict()
    # can map predictions back.
    le = LabelEncoder()
    train_label = le.fit_transform(label_list)
    helper.dump(le, name + "_label.sav")
    # Bag-of-words counts -> TF-IDF weighting -> one-vs-rest linear SVM.
    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC())),
    ])
    classifier.fit(feature_list, train_label)
    helper.dump(classifier, name + "_model.sav")
    score = classifier.score(feature_list, train_label)
    print(score)
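# The score printed above is measured on the training data, so it is
# optimistic. A held-out evaluation is a small extension; this sketch assumes
# the same read_dataset() helper and reuses the imports above:
from sklearn.model_selection import train_test_split

def evaluate(name):
    stopwords = helper.read_stopwords()
    feature_list, label_list = read_dataset(name, stopwords)
    x_train, x_test, y_train, y_test = train_test_split(
        feature_list, label_list, test_size=0.2, random_state=42)
    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC())),
    ])
    classifier.fit(x_train, y_train)
    # Accuracy on samples the model has never seen.
    print(classifier.score(x_test, y_test))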
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

from piz_ml.skl import helper
# DateUtils, read_dataset, and ItemSelector come from project-local modules.


def create_feature(name):
    time1 = DateUtils.current_time_millis()
    feature_list, label_list = read_dataset(name)
    time2 = DateUtils.current_time_millis()
    print("LOAD COMPLETE:{}ms".format(time2 - time1))

    feature_list = np.array(feature_list)
    stop_words = helper.read_stopwords()
    # Combine two views of each sample: index 1 holds a dict of categorical
    # features, index 0 holds the raw text content.
    union = FeatureUnion(
        transformer_list=[
            ("feature", Pipeline([
                ('selector', ItemSelector(1)),
                ("dvec", DictVectorizer(sparse=False)),
            ])),
            ("content", Pipeline([
                ('selector', ItemSelector(0)),
                ('cvec', CountVectorizer(
                    # analyzer='char_wb',
                    token_pattern=r"(?u)\b\w+\b",
                    min_df=1,
                    stop_words=stop_words)),
                ('tfidf', TfidfTransformer()),
            ])),
        ],
        transformer_weights={
            "feature": 1.0,
            "content": 1.0,
        })
    feature_list = union.fit_transform(feature_list)
    time1 = DateUtils.current_time_millis()
    print("TRANSFORM COMPLETE:{}ms".format(time1 - time2))

    # dvec: DictVectorizer = union.transformer_list[0][1].named_steps["dvec"]
    # helper.dump(dvec, name + "_dvec.sav")
    # cvec: CountVectorizer = union.transformer_list[1][1].named_steps["cvec"]
    # helper.dump(cvec, name + "_cvec.sav")
    helper.dump(union, name + "_vec.sav")
    time2 = DateUtils.current_time_millis()
    print("DUMP VECTOR COMPLETE:{}ms".format(time2 - time1))

    helper.dump(feature_list, name + "_data.sav")
    helper.dump(label_list, name + "_label.sav")
    time1 = DateUtils.current_time_millis()
    print("DUMP FEATURE COMPLETE:{}ms".format(time1 - time2))
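# A sketch of how a downstream step might reuse the artifacts dumped above.
# The two-column sample layout (text at index 0, feature dict at index 1)
# follows the ItemSelector usage in create_feature(); transform_new_samples
# is a hypothetical name:
def transform_new_samples(name, samples):
    # `samples` must mirror the training layout, e.g.
    # np.array([("some raw text", {"field": "value"})], dtype=object)
    union = helper.load(name + "_vec.sav")
    return union.transform(samples)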
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelEncoder

from piz_ml.skl import helper
# read_dataset, transform, and ItemSelector come from project-local modules.


def train(name):
    stopwords = helper.read_stopwords()
    (feature_list, label_list, headers) = read_dataset(name, stopwords)
    feature_dict = transform(feature_list, headers)
    le = LabelEncoder()
    dummy_y = le.fit_transform(label_list)
    helper.dump(le, name + "_label.sav")
    # Pipeline estimator.
    classifier = Pipeline([
        # Vectorize the three text fields in parallel, then concatenate.
        ('union', FeatureUnion(
            transformer_list=[
                ('故障现象', Pipeline([  # "fault symptom"
                    ('selector', ItemSelector(key='故障现象')),
                    # Term-frequency feature extraction.
                    ('tfidf', TfidfVectorizer(analyzer='char_wb',
                                              token_pattern=r"(?u)\b\w+\b",
                                              min_df=1)),
                ])),
                ('原因分析', Pipeline([  # "cause analysis"
                    ('selector', ItemSelector(key='原因分析')),
                    ('tfidf', TfidfVectorizer(analyzer='char_wb',
                                              token_pattern=r"(?u)\b\w+\b",
                                              min_df=1)),
                ])),
                ('处理意见及结果', Pipeline([  # "handling advice and result"
                    ('selector', ItemSelector(key='处理意见及结果')),
                    ('tfidf', TfidfVectorizer(analyzer='char_wb',
                                              token_pattern=r"(?u)\b\w+\b",
                                              min_df=1)),
                ])),
            ],
            transformer_weights={
                '故障现象': 2.0,
                '原因分析': 1.5,
                '处理意见及结果': 1.0,
            },
        )),
        # ('svc', SVC(kernel='linear')),
        ('RFC', RandomForestClassifier()),
    ])
    classifier.fit(feature_dict, dummy_y)
    helper.dump(classifier, name + "_model.sav")
    score = classifier.score(feature_dict, dummy_y)
    print(score)  # 1.0 on the training set; PS: too few samples
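# The pipelines above rely on an ItemSelector transformer whose definition is
# not shown. A minimal sketch following the scikit-learn "select one field per
# sample" pattern, covering both usages seen here (positional ItemSelector(1)
# and keyed ItemSelector(key='故障现象')); the real project class may differ:
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    """Pick one column/field out of each sample for the downstream step."""

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self  # stateless: nothing to learn

    def transform(self, data):
        if isinstance(self.key, int):
            # Positional use: data is a sequence of (text, dict, ...) rows.
            return [row[self.key] for row in data]
        # Keyed use: data is a dict of columns (or a DataFrame).
        return data[self.key]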
"""保存一般模型""" import numpy as np from sklearn.feature_extraction import DictVectorizer from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer from sklearn.pipeline import Pipeline, FeatureUnion from piz_ml.skl import helper from piz_ml.skl.Test04 import read_dataset, ItemSelector if __name__ == "__main__": (feature_list, label_list) = read_dataset("sample04_test") feature_list = np.array(feature_list) stop_words = helper.read_stopwords() # feature_list = feature_list[:, 0] # union = FeatureUnion( transformer_list=[ ("feature", Pipeline([('selector', ItemSelector(1)), ("vec", DictVectorizer(sparse=False))])), ( "content", Pipeline([ ('selector', ItemSelector(0)), ( 'cvec', CountVectorizer( # analyzer='char_wb', token_pattern=r"(?u)\b\w+\b", min_df=1, stop_words=stop_words)),