def main():
    corpus, labels = get_data()  # load the dataset
    print("total number of samples:", len(labels))
    corpus, labels = remove_empty_docs(corpus, labels)
    # print('one sample:', corpus[10])
    # print('label of the sample:', labels[10])
    # label_name_map = ["spam", "normal"]
    # print('actual types:', label_name_map[int(labels[10])], label_name_map[int(labels[5900])])
    # labels[0:4999] are 1.0, labels[5000:10001] are 0.0
    # print('actual types:', label_name_map[1], label_name_map[0])

    # split the data into train / test sets
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)

    # normalize and preprocess the data
    from normalization import normalize_corpus

    norm_train_corpus = normalize_corpus(train_corpus)
    # print(norm_train_corpus[:3])
    # two hand-written test documents used instead of the normalized split test set
    norm_test_corpus = [
        '中信(国际)电子科技有限公司推出新产品:升职步步高、做生意发大财、连找情人都用的上,详情进入网址httpwwwusa5588comccc电话:02033770208服务热线:013650852999',
        '向专利局递交申请需要将文件转为PDF格式。我已经将说明书、说明书附图、权利要求书、摘要转化为PDF格式。由于WORED文档转化为PDF文档时公式和变量容易变形,而这种错误在申请递交给专利局之后将无法弥补,所以,请你逐字对照检查,确保PDF文件中没有变形错误,尤其是变量的上标、下标、运算符。'
    ]
    # norm_test_corpus = normalize_corpus(test_corpus)
    # print(norm_test_corpus)
    # NOTE: with this two-document test corpus, test_labels from the split no longer
    # lines up; supply labels for these two documents when evaluating below.

    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    """
    bow_train_features looks like:
    (0, 173)  1    # document 0, vocabulary index 173, term frequency 1
    (0, 54)   1
    (0, 4)    1
    """
    # bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # train the classifiers
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()                            # Naive Bayes
    svm = SGDClassifier(loss='hinge', max_iter=100)  # linear SVM; n_iter was renamed max_iter in newer scikit-learn
    lr = LogisticRegression()                        # Logistic Regression

    print("SVM on tfidf features")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)
    print(svm_tfidf_predictions)
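All of these listings call train_predict_evaluate_model, but the helper itself lives in another module of the project and is never shown. A minimal sketch, assuming such a helper simply fits the classifier, predicts on the held-out features, and prints the usual classification metrics (an assumption, not the project's actual code):

from sklearn import metrics

def train_predict_evaluate_model(classifier, train_features, train_labels,
                                 test_features, test_labels):
    # fit the classifier on the training features
    classifier.fit(train_features, train_labels)
    # predict on the held-out features
    predictions = classifier.predict(test_features)
    # report standard classification metrics
    print('accuracy :', metrics.accuracy_score(test_labels, predictions))
    print('precision:', metrics.precision_score(test_labels, predictions, average='weighted'))
    print('recall   :', metrics.recall_score(test_labels, predictions, average='weighted'))
    print('f1 score :', metrics.f1_score(test_labels, predictions, average='weighted'))
    return predictions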
    'i love blue cheese'
]

# We use new_doc as our test dataset
new_doc = ['loving this blue sky today']

import pandas as pd


def display_features(features, feature_names):
    df = pd.DataFrame(data=features, columns=feature_names)
    print(df)


# We pass our CORPUS to the simplest bow extractor we created
from feature_extractors import bow_extractor

bow_vectorizer, bow_features = bow_extractor(CORPUS)
features = bow_features.todense()  # since we can't view the default 'sparse matrix' directly
print(features)

# Remember, we always need to extract the same features from our test data too!
new_doc_features = bow_vectorizer.transform(new_doc)
new_doc_features = new_doc_features.todense()
print(new_doc_features)

# Let's see which words/tokens these counts are for...
feature_names = bow_vectorizer.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.0
print(feature_names)

# Let's print both the feature names and counts together
# - first for the training data and then for the test data
display_features(features, feature_names)
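bow_extractor itself comes from the project's feature_extractors module, which is not part of this excerpt. A minimal sketch, assuming it is a thin wrapper around scikit-learn's CountVectorizer (the real module may differ in details):

from sklearn.feature_extraction.text import CountVectorizer

def bow_extractor(corpus, ngram_range=(1, 1)):
    # fit a term-count vectorizer on the corpus and return it with the count matrix
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features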
def main():
    corpus, labels = get_data()
    print("total data size:", len(labels))
    corpus, labels = remove_empty_docs(corpus, labels)
    print("sample:", corpus[10])
    print("label of sample:", labels[10])
    label_name_map = ['spam', 'normal']  # 0 means spam, 1 means normal
    print("actual type:", label_name_map[int(labels[10])])

    # split the dataset
    train_corpus, train_labels, test_corpus, test_labels = prepare_datasets(
        corpus, labels)

    # preprocess the corpus
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    # bag-of-words model
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf model
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize the preprocessed corpus
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # Word2Vec word vectors (use vector_size=500 instead of size=500 on gensim >= 4.0)
    model = gensim.models.Word2Vec(tokenized_train,
                                   size=500, window=100, min_count=30, sample=1e-3)

    # train Multinomial Naive Bayes, SVM and Logistic Regression classifiers
    # and evaluate each of them
    mnb = MultinomialNB()       # Naive Bayes
    svm = SGDClassifier()       # SVM (hinge loss by default)
    lr = LogisticRegression()   # Logistic Regression

    print("\nNaive Bayes based on BOW")
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    print("\nLogistic Regression based on BOW")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    print("\nSVM based on BOW")
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    print("\nNaive Bayes based on tfidf")
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    print("\nLogistic Regression based on tfidf")
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    print("\nSVM based on tfidf")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)
def main():
    label_name_map = ["spam", "normal"]

    # split the data into train / test sets
    train_corpus, test_corpus, train_labels, test_labels = data_preprocess()

    # normalize: remove special characters
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    # tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    # tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # build the word2vec model
    # logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s", level=logging.INFO)
    # model = gensim.models.Word2Vec(tokenized_train,
    #                                size=500, window=100, min_count=30, sample=1e-3)
    # model.save("./vector.model")
    # model = gensim.models.Word2Vec.load("./vector.model")
    # print("word vector model loaded....")

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()
    svm = SGDClassifier(loss='hinge', max_iter=100)  # n_iter was renamed max_iter in newer scikit-learn
    lr = LogisticRegression()

    # Multinomial Naive Bayes on bag-of-words features
    print("Naive Bayes on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # Logistic Regression on bag-of-words features
    print("Logistic Regression on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # SVM on bag-of-words features
    print("SVM on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # Multinomial Naive Bayes on tfidf features
    print("Naive Bayes on tfidf features")
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # Logistic Regression on tfidf features
    print("Logistic Regression on tfidf features")
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # SVM on tfidf features
    print("SVM on tfidf features")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # pull out a few correctly classified and a few misclassified samples
    import re

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('mail type:', label_name_map[int(label)])
            print('predicted mail type:', label_name_map[int(predicted_label)])
            print('text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('mail type:', label_name_map[int(label)])
            print('predicted mail type:', label_name_map[int(predicted_label)])
            print('text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break
from normalization import normalize_corpus

norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)

from feature_extractors import bow_extractor, tfidf_extractor
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim

# bag of words features
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
bow_test_features = bow_vectorizer.transform(norm_test_corpus)

# tfidf features
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

# tokenize documents
tokenized_train = [nltk.word_tokenize(text) for text in norm_train_corpus]
tokenized_test = [nltk.word_tokenize(text) for text in norm_test_corpus]

# build word2vec model
model = gensim.models.Word2Vec(tokenized_train,
                               size=500, window=100, min_count=30, sample=1e-3)
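This listing imports averaged_word_vectorizer and tfidf_weighted_averaged_word_vectorizer but stops right after building the word2vec model. A hedged sketch of how the averaged variant typically turns tokenized documents plus the trained model into fixed-length document features (an assumption about what the project's feature_extractors does, not its actual code):

import numpy as np

def average_word_vectors(tokens, model, vocabulary, num_features):
    # mean of the vectors of all in-vocabulary tokens (zeros if none match)
    feature_vector = np.zeros((num_features,), dtype='float64')
    in_vocab = [word for word in tokens if word in vocabulary]
    for word in in_vocab:
        feature_vector = np.add(feature_vector, model.wv[word])
    if in_vocab:
        feature_vector = np.divide(feature_vector, len(in_vocab))
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)  # model.wv.index2word on gensim < 4.0
    return np.array([average_word_vectors(tokens, model, vocabulary, num_features)
                     for tokens in corpus])

# e.g. avg_wv_train_features = averaged_word_vectorizer(tokenized_train, model, 500)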
def main():
    corpus, labels = get_data()  # load the dataset
    print("total number of samples:", len(labels))
    corpus, labels = remove_empty_docs(corpus, labels)
    print('one sample:', corpus[10])
    print('label of the sample:', labels[10])
    label_name_map = ["spam", "normal"]
    print('actual types:', label_name_map[int(labels[10])], label_name_map[int(labels[5900])])

    # split the data into train / test sets
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)

    # normalize the corpus
    from normalization import normalize_corpus

    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # build the word2vec model (use vector_size=500 instead of size=500 on gensim >= 4.0)
    model = gensim.models.Word2Vec(tokenized_train,
                                   size=500, window=100, min_count=30, sample=1e-3)

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()
    svm = SGDClassifier(loss='hinge', max_iter=100)  # n_iter was renamed max_iter in newer scikit-learn
    lr = LogisticRegression()

    # Multinomial Naive Bayes on bag-of-words features
    print("Naive Bayes on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # Logistic Regression on bag-of-words features
    print("Logistic Regression on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # SVM on bag-of-words features
    print("SVM on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # Multinomial Naive Bayes on tfidf features
    print("Naive Bayes on tfidf features")
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # Logistic Regression on tfidf features
    print("Logistic Regression on tfidf features")
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # SVM on tfidf features
    print("SVM on tfidf features")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    import re

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('mail type:', label_name_map[int(label)])
            print('predicted mail type:', label_name_map[int(predicted_label)])
            print('text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('mail type:', label_name_map[int(label)])
            print('predicted mail type:', label_name_map[int(predicted_label)])
            print('text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break
pre_process_corpus = np.vectorize(pre_process_document)

train_headlines = pre_process_corpus(train_headlines)
# val_reviews = pre_process_corpus(val_reviews)
test_headlines = pre_process_corpus(test_headlines)

# feature extraction
from feature_extractors import bow_extractor, tfidf_extractor
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim

# bag of words features
bow_vectorizer, bow_train_features = bow_extractor(train_headlines)
bow_test_features = bow_vectorizer.transform(test_headlines)

# tfidf features
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(train_headlines)
tfidf_test_features = tfidf_vectorizer.transform(test_headlines)

# tokenize documents
tokenized_train = [nltk.word_tokenize(text) for text in train_headlines]
tokenized_test = [nltk.word_tokenize(text) for text in test_headlines]

# build word2vec model (same hyperparameters as the other listings;
# use vector_size=500 instead of size=500 on gensim >= 4.0)
model = gensim.models.Word2Vec(tokenized_train,
                               size=500, window=100, min_count=30, sample=1e-3)
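pre_process_document is vectorized with np.vectorize above but is not defined in this excerpt. A minimal, assumed sketch of such a cleaning function (the project's real version may also remove stopwords, expand contractions, and so on):

import re

def pre_process_document(document):
    # lowercase, strip HTML-like tags, drop non-alphanumeric characters, squeeze whitespace
    text = document.lower()
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text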
def main():
    corpus, labels = get_data()  # load the dataset
    print('total number of documents:', len(corpus))
    print('number of labels:', len(labels))
    corpus, labels = remove_empty_docs(corpus, labels)
    print('one sample:', corpus[0])
    print('label of the sample:', labels[243])
    label_name_map = ['spam', 'normal']
    print('actual:', label_name_map[int(labels[10])], label_name_map[int(labels[8908])])

    # split the data into train / test sets
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)
    print('training set size:', len(train_corpus))
    print('test set size:', len(test_corpus))

    # normalize the corpus
    from normalization import normalize_corpus

    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # train the classifiers
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()
    # svm = SGDClassifier(loss='hinge', n_iter=100)  # n_iter no longer exists in newer scikit-learn
    svm = SGDClassifier(loss='hinge')
    lr = LogisticRegression()

    # Multinomial Naive Bayes on bag-of-words features
    print('Naive Bayes on bag-of-words features')
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # Logistic Regression on bag-of-words features
    print("Logistic Regression on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # SVM on bag-of-words features
    print('SVM on bag-of-words features')
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # Multinomial Naive Bayes on tfidf features
    print('Naive Bayes on tfidf features')
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # Logistic Regression on tfidf features
    print('Logistic Regression on tfidf features')
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # SVM on tfidf features
    print('SVM on tfidf features')
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # show a few correctly classified and a few misclassified samples
    # (two passes so that each group gets its own counter)
    import re

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('mail type:', label_name_map[int(label)])
            print('predicted mail type:', label_name_map[int(predicted_label)])
            print('text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('mail type:', label_name_map[int(label)])
            print('predicted mail type:', label_name_map[int(predicted_label)])
            print('text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break
    'i love blue cheese'
]

new_doc = ['loving this blue sky today']

import pandas as pd


def display_features(features, feature_names):
    df = pd.DataFrame(data=features, columns=feature_names)
    print(df)


from feature_extractors import bow_extractor

bow_vectorizer, bow_features = bow_extractor(CORPUS)
features = bow_features.todense()
print(features)

new_doc_features = bow_vectorizer.transform(new_doc)
new_doc_features = new_doc_features.todense()
print(new_doc_features)

feature_names = bow_vectorizer.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.0
print(feature_names)

display_features(features, feature_names)
display_features(new_doc_features, feature_names)

import numpy as np
def conver2BOW(data):
    # copy the questions into a new list and hand it to the bag-of-words extractor
    new_data = []
    for q in data:
        new_data.append(q)
    bow_vectorizer, bow_X = bow_extractor(new_data)
    return bow_vectorizer, bow_X
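A small usage example of conver2BOW on a few hypothetical question strings (the data here is illustrative only, and it assumes bow_extractor from feature_extractors is already imported):

questions = ['how do I reset my password',
             'reset password link not working',
             'how to change my email address']
bow_vectorizer, bow_X = conver2BOW(questions)
print(bow_X.todense())                         # dense term-count matrix
print(bow_vectorizer.get_feature_names_out())  # get_feature_names() on older scikit-learn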
def main():
    # load the dataset
    corpus, labels = get_data()
    print("total number of samples:", len(labels))

    # remove useless (empty) documents
    corpus, labels = remove_empty_docs(corpus, labels)
    print('one sample:', corpus[10])
    print('label of the sample:', labels[10])
    label_name_map = ["spam", "normal"]  # index 0 is spam, index 1 is normal mail
    print('actual types:', label_name_map[int(labels[10])], label_name_map[int(labels[5900])])

    # split the data into train / test sets
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)

    # normalize the corpus
    # (passing True as a second argument would enable word-segmentation info)
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)
    print(norm_train_corpus[11])
    print("========== preprocessing finished ==========")

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize with jieba
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # build the word2vec model (use vector_size=500 instead of size=500 on gensim >= 4.0)
    model = gensim.models.Word2Vec(tokenized_train,
                                   size=500, window=100, min_count=30, sample=1e-3)

    # Naive Bayes classifier
    mnb = MultinomialNB()
    # SVM classifier (n_iter was renamed max_iter in newer scikit-learn)
    svm = SGDClassifier(loss='hinge', max_iter=100)
    # Logistic Regression classifier
    lr = LogisticRegression()

    # two feature representations, three classification models
    # Multinomial Naive Bayes on bag-of-words features
    print("Naive Bayes on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # Logistic Regression on bag-of-words features
    print("Logistic Regression on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # SVM on bag-of-words features
    print("SVM on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # Multinomial Naive Bayes on tfidf features
    print("Naive Bayes on tfidf features")
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # Logistic Regression on tfidf features
    print("Logistic Regression on tfidf features")
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # SVM on tfidf features
    print("SVM on tfidf features")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    import re

    # iterate over (document, true label, predicted label) triples
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        # spam correctly classified as spam
        if label == 0 and predicted_label == 0:
            print('mail type:', label_name_map[int(label)])
            print('predicted mail type:', label_name_map[int(predicted_label)])
            print('original text:')
            print(document)
            num += 1
            if num == 4:
                break

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('mail type:', label_name_map[int(label)])
            print('predicted mail type:', label_name_map[int(predicted_label)])
            print('original text:')
            print(document)
            num += 1
            if num == 4:
                break
def main():
    corpus, labels = get_data()  # load the dataset
    print("total number of samples:", len(labels))
    corpus, labels = remove_empty_docs(corpus, labels)
    print('one sample:', corpus[10])
    print('label of the sample:', labels[10])
    label_name_map = ["spam", "normal"]
    # labels[0:4999] are 1.0, labels[5000:10001] are 0.0
    print('actual types:', label_name_map[int(labels[10])], label_name_map[int(labels[5900])])
    print('actual types:', label_name_map[1], label_name_map[0])

    # split the data into train / test sets
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)

    # normalize and preprocess the data
    from normalization import normalize_corpus

    norm_train_corpus = normalize_corpus(train_corpus)
    # print(norm_train_corpus[:3])
    norm_test_corpus = normalize_corpus(test_corpus)
    # print(norm_test_corpus)
    # norm_test_corpus1 = ['中信(国际)电子科技有限公司推出新产品:升职步步高、做生意发大财、连找情人都用的上,详情进入网址httpwwwusa5588comccc电话:02033770208服务热线:013650852999']

    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    # print(bow_vectorizer)
    # print(bow_train_features)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # build the word2vec model (use vector_size=500 instead of size=500 on gensim >= 4.0)
    model = gensim.models.Word2Vec(tokenized_train,
                                   size=500, window=100, min_count=30, sample=1e-3)

    # train the classifiers
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()                            # Naive Bayes
    svm = SGDClassifier(loss='hinge', max_iter=100)  # linear SVM; n_iter was renamed max_iter in newer scikit-learn
    lr = LogisticRegression()                        # Logistic Regression

    # Multinomial Naive Bayes on bag-of-words features
    print("Naive Bayes on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)
    # print(mnb_bow_predictions)  # returned predictions look like: [0. 0. 1. ... 0. 1. 0.]
    # Logistic Regression on bag-of-words features
    print("Logistic Regression on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # SVM on bag-of-words features
    print("SVM on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=bow_train_features, train_labels=train_labels,
        test_features=bow_test_features, test_labels=test_labels)

    # Multinomial Naive Bayes on tfidf features
    print("Naive Bayes on tfidf features")
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # Logistic Regression on tfidf features
    print("Logistic Regression on tfidf features")
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    # SVM on tfidf features
    print("SVM on tfidf features")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm, train_features=tfidf_train_features, train_labels=train_labels,
        test_features=tfidf_test_features, test_labels=test_labels)

    import re

    # some correctly classified mails
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('mail type:', label_name_map[int(label)])
            print('predicted mail type:', label_name_map[int(predicted_label)])
            print('text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break

    # some misclassified mails
    print("some misclassified mails:")
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('mail type:', label_name_map[int(label)])
            print('predicted mail type:', label_name_map[int(predicted_label)])
            print('text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break