def create_word_scores():
    """Score every word of the labelled review sets by chi-square.

    The positive and negative workbooks (``pos_review.xlsx`` and
    ``neg_review.xlsx``, sheet 1 / column 1) are read from the hard-coded
    ``SenimentReviewSet`` directory with ``tp.seg_fil_senti_excel`` and the
    sentiment stop-word list.  The result of each read is flattened with
    ``itertools.chain`` into one token stream per polarity.

    Returns:
        dict: token -> chi-square statistic of the token against the
        positive stream plus its chi-square statistic against the
        negative stream.
    """
    corpus_dir = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature\SenimentReviewSet'
    stopword_file = 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'

    # One flattened token stream per polarity, positive first.
    labelled_tokens = []
    for label, workbook in (('pos', '/pos_review.xlsx'),
                            ('neg', '/neg_review.xlsx')):
        segmented = tp.seg_fil_senti_excel(corpus_dir + workbook, 1, 1,
                                           stopword_file)
        labelled_tokens.append((label, list(itertools.chain(*segmented))))

    # Overall token frequencies and per-polarity token frequencies.
    overall_fd = FreqDist()
    by_label_fd = ConditionalFreqDist()
    for label, tokens in labelled_tokens:
        for token in tokens:
            overall_fd[token] += 1
            by_label_fd[label][token] += 1

    pos_total = by_label_fd['pos'].N()
    neg_total = by_label_fd['neg'].N()
    grand_total = pos_total + neg_total

    chi_sq = BigramAssocMeasures.chi_sq
    scores = {}
    for token, count in overall_fd.iteritems():
        from_pos = chi_sq(by_label_fd['pos'][token], (count, pos_total),
                          grand_total)
        from_neg = chi_sq(by_label_fd['neg'][token], (count, neg_total),
                          grand_total)
        scores[token] = from_pos + from_neg
    return scores
def predDataSentPro(reviewDataSetDir, reviewDataSetName, reviewDataSetFileType, sheetNum, colNum, desDir):
    """Classify the reviews of one Excel sheet and dump tags and probabilities.

    Two text files are written into ``desDir``:

    * ``<reviewDataSetName>ClassTag.txt`` - one predicted label per line.
    * ``<reviewDataSetName>ClassPro.txt`` - one line per review holding the
      'pos' probability, a tab, and the 'neg' probability.

    Relies on the module-level names ``extract_features``, ``best_words``
    and ``best_classifier``.

    Args:
        reviewDataSetDir: directory that contains the workbook.
        reviewDataSetName: workbook file name without extension; also used
            as the prefix of the output file names.
        reviewDataSetFileType: extension, appended to the name as-is.
        sheetNum: sheet index forwarded to ``tp.seg_fil_senti_excel``.
        colNum: column index forwarded to ``tp.seg_fil_senti_excel``.
        desDir: directory that receives the two output files.

    Returns:
        tuple: ``(reviewCount, elapsed)`` - the number of probability rows
        written and the ``time.clock()`` difference for the whole run.
    """
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    preResStorePath = desDir + '/' + reviewDataSetName + 'ClassPro.txt'
    preTagStorePath = desDir + '/' + reviewDataSetName + 'ClassTag.txt'
    start = time.clock()

    sentiment_review = tp.seg_fil_senti_excel(
        reviewDataSetPath, sheetNum, colNum,
        'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    # Turn the preprocessed reviews into classifier features.
    review_feature = extract_features(sentiment_review, best_words)

    # The pickle file name is the first 15 characters of str(best_classifier).
    # The backslash is written explicitly; the path value is unchanged.
    classifierPath = 'D:/ReviewHelpfulnessPrediction\\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # NOTE(review): the open mode is left at the default (text) to match how
    # the file was read before; confirm against the code that writes the
    # pickle.  Only load classifier files from a trusted location.
    with open(classifierPath) as clf_file:
        clf = pickle.load(clf_file)

    # Predicted class label of every review.
    data_tag = clf.batch_classify(review_feature)
    with open(preTagStorePath, 'w') as tag_file:
        for tag in data_tag:
            tag_file.write(str(tag) + '\n')

    # Positive / negative probability of every review.
    pred = clf.batch_prob_classify(review_feature)
    reviewCount = 0
    with open(preResStorePath, 'w') as pro_file:
        for dist in pred:
            reviewCount += 1
            pro_file.write(str(dist.prob('pos')) + '\t' + str(dist.prob('neg')) + '\n')

    end = time.clock()
    return reviewCount, end - start
def predictDataSentimentPro(reviewDataSetDir, reviewDataSetName, reviewDataSetFileType, sheetNum, colNum, desDir):
    """Classify the reviews of one Excel sheet and dump results plus inputs.

    Four text files are written into ``desDir``, all prefixed with
    ``reviewDataSetName``:

    * ``ClassTag.txt`` - one predicted label per line.
    * ``ClassPro.txt`` - 'pos' probability, tab, 'neg' probability per line.
    * ``OriData.txt``  - the raw review texts, UTF-8 encoded, one per line.
    * ``OriFea.txt``   - the feature keys of each review on one line:
      a plain key is followed by a tab, every element of a tuple key is
      followed by an underscore.

    Relies on the module-level names ``extract_features``, ``best_words``
    and ``best_classifier``.

    Args:
        reviewDataSetDir: directory that contains the workbook.
        reviewDataSetName: workbook file name without extension; also used
            as the prefix of the output file names.
        reviewDataSetFileType: extension, appended to the name as-is.
        sheetNum: sheet index forwarded to the ``tp`` readers.
        colNum: column index forwarded to the ``tp`` readers.
        desDir: directory that receives the output files.

    Returns:
        tuple: ``(reviewCount, elapsed)`` - the number of probability rows
        written and the ``time.clock()`` difference for the whole run.
    """
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    oriDataPath = desDir + '/' + reviewDataSetName + 'OriData.txt'
    oriDataFeaPath = desDir + '/' + reviewDataSetName + 'OriFea.txt'
    preResStorePath = desDir + '/' + reviewDataSetName + 'ClassPro.txt'
    preTagStorePath = desDir + '/' + reviewDataSetName + 'ClassTag.txt'
    start = time.clock()

    # Raw review texts to be classified.
    review = tp.get_excel_data(reviewDataSetPath, sheetNum, colNum, "data")
    # Same cells after word segmentation and stop-word removal.
    sentiment_review = tp.seg_fil_senti_excel(
        reviewDataSetPath, sheetNum, colNum,
        'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    # Turn the preprocessed reviews into classifier features.
    review_feature = extract_features(sentiment_review, best_words)

    # The pickle file name is the first 15 characters of str(best_classifier).
    # The backslash is written explicitly; the path value is unchanged.
    classifierPath = 'D:/ReviewHelpfulnessPrediction\\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # NOTE(review): the open mode is left at the default (text) to match how
    # the file was read before; confirm against the code that writes the
    # pickle.  Only load classifier files from a trusted location.
    with open(classifierPath) as clf_file:
        clf = pickle.load(clf_file)

    # Predicted class label of every review.
    data_tag = clf.batch_classify(review_feature)
    with open(preTagStorePath, 'w') as tag_file:
        for tag in data_tag:
            tag_file.write(str(tag) + '\n')

    # Positive / negative probability of every review.
    pred = clf.batch_prob_classify(review_feature)
    reviewCount = 0
    with open(preResStorePath, 'w') as pro_file:
        for dist in pred:
            reviewCount += 1
            pro_file.write(str(dist.prob('pos')) + '\t' + str(dist.prob('neg')) + '\n')

    # Raw review texts.
    with open(oriDataPath, 'w') as data_file:
        for text in review:
            data_file.write(text.encode('utf-8') + '\n')

    # Feature keys extracted from each review, one review per line.
    with open(oriDataFeaPath, 'w') as fea_file:
        for features in review_feature:
            for key in features:
                if isinstance(key, tuple):
                    # Tuple keys (bigrams) are written element by element.
                    for part in key:
                        fea_file.write(part.encode('utf-8') + '_')
                else:
                    fea_file.write(key.encode('utf-8') + '\t')
            fea_file.write('\n')

    end = time.clock()
    return reviewCount, end - start
def create_word_bigram_scores():
    """Score words and top bigrams of the labelled data by chi-square.

    Sheet 1 (positive) and sheet 2 (negative) of ``posNegLabelData.xls`` in
    the hard-coded ``LabelReviewData`` directory are read with
    ``tp.seg_fil_senti_excel`` and flattened into one token stream each.
    For each stream the 5000 best bigrams by chi-square are appended to
    the single tokens, and the combined features are counted.

    Returns:
        dict: feature (token or bigram tuple) -> chi-square statistic
        against the positive features plus chi-square statistic against
        the negative features.
    """
    label_dir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData'
    stopword_file = 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
    chi_sq = BigramAssocMeasures.chi_sq

    pos_reviews = tp.seg_fil_senti_excel(label_dir + '/posNegLabelData.xls', 1, 1, stopword_file)
    neg_reviews = tp.seg_fil_senti_excel(label_dir + '/posNegLabelData.xls', 2, 1, stopword_file)
    pos_tokens = list(itertools.chain(*pos_reviews))
    neg_tokens = list(itertools.chain(*neg_reviews))

    # Best 5000 collocations of each polarity, ranked by chi-square.
    pos_bigrams = BigramCollocationFinder.from_words(pos_tokens).nbest(chi_sq, 5000)
    neg_bigrams = BigramCollocationFinder.from_words(neg_tokens).nbest(chi_sq, 5000)

    overall_fd = FreqDist()
    by_label_fd = ConditionalFreqDist()
    for feature in pos_tokens + pos_bigrams:
        overall_fd[feature] += 1
        by_label_fd['pos'][feature] += 1
    for feature in neg_tokens + neg_bigrams:
        overall_fd[feature] += 1
        by_label_fd['neg'][feature] += 1

    pos_total = by_label_fd['pos'].N()
    neg_total = by_label_fd['neg'].N()
    grand_total = pos_total + neg_total

    return {
        feature: (chi_sq(by_label_fd['pos'][feature], (count, pos_total), grand_total)
                  + chi_sq(by_label_fd['neg'][feature], (count, neg_total), grand_total))
        for feature, count in overall_fd.iteritems()
    }
def predictExcelDataSentTagProToExcel(reviewDataSetDir, reviewDataSetName, reviewDataSetFileType, sheetNum, colNum, desDir):
    """Classify the reviews of one Excel sheet and save everything to a workbook.

    The result workbook ``<desDir>/<reviewDataSetName>RawDataTagProFea.xls``
    gets one row per review with these columns:

    0. raw review text
    1. predicted label
    2. 'pos' probability (as a string)
    3. 'neg' probability (as a string)
    4. feature keys, each followed by a space; the elements of a tuple key
       (bigram) are joined with underscores

    Relies on the module-level names ``extract_features``, ``best_words``
    and ``best_classifier``.

    Args:
        reviewDataSetDir: directory that contains the input workbook.
        reviewDataSetName: workbook file name without extension; also used
            as the prefix of the output file name.
        reviewDataSetFileType: extension, appended to the name as-is.
        sheetNum: sheet index forwarded to the ``tp`` readers.
        colNum: column index forwarded to the ``tp`` readers.
        desDir: directory that receives the result workbook.

    Returns:
        tuple: ``(posProbility, preDataResPath, review)`` - the list of
        'pos' probabilities, the path of the saved workbook, and the raw
        review data.
    """
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    preDataResPath = desDir + '/' + reviewDataSetName + 'RawDataTagProFea.xls'
    start = time.clock()

    # Raw review texts to be classified.
    review = tp.get_excel_data(reviewDataSetPath, sheetNum, colNum, "data")
    # Same cells after word segmentation and stop-word removal.
    sentiment_review = tp.seg_fil_senti_excel(
        reviewDataSetPath, sheetNum, colNum,
        'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    # Turn the preprocessed reviews into classifier features.
    review_feature = extract_features(sentiment_review, best_words)

    # The pickle file name is the first 15 characters of str(best_classifier).
    # The backslash is written explicitly; the path value is unchanged.
    classifierPath = 'D:/ReviewHelpfulnessPrediction\\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # NOTE(review): the open mode is left at the default (text) to match how
    # the file was read before; confirm against the code that writes the
    # pickle.  Only load classifier files from a trusted location.
    with open(classifierPath) as clf_file:
        clf = pickle.load(clf_file)

    dataItemCount = len(sentiment_review)
    # Predicted labels and pos/neg probability distributions.
    data_tag = clf.batch_classify(review_feature)
    res_pro = clf.batch_prob_classify(review_feature)

    preResFile = xlwt.Workbook(encoding='utf-8')
    preResSheet = preResFile.add_sheet('RawDataTagProFea')
    posProbility = []
    for rowPos in range(dataItemCount):
        posProb = res_pro[rowPos].prob('pos')
        posProbility.append(posProb)

        preResSheet.write(rowPos, 0, review[rowPos])      # raw text
        preResSheet.write(rowPos, 1, data_tag[rowPos])    # label
        preResSheet.write(rowPos, 2, str(posProb))        # pos probability
        preResSheet.write(rowPos, 3, str(res_pro[rowPos].prob('neg')))  # neg probability

        # Feature keys may be single words or bigram tuples; every key is
        # followed by one space, so the cell keeps its trailing space.
        tokens = []
        for key in review_feature[rowPos].keys():
            if isinstance(key, tuple):
                tokens.append('_'.join(key))
            else:
                tokens.append(key)
        feature = ''.join(token + ' ' for token in tokens)
        preResSheet.write(rowPos, 4, feature)             # features

    preResFile.save(preDataResPath)
    end = time.clock()
    # Single-argument form: emits the same text as the comma-separated
    # print statement and parses with or without print_function.
    print('handle sentences num: %s  classify time: %s' % (dataItemCount, end - start))
    return posProbility, preDataResPath, review
from sklearn.neighbors import RadiusNeighborsClassifier from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor from sklearn.neural_network import MLPClassifier from sklearn import cross_validation from sklearn.metrics import f1_score, precision_score, recall_score import numpy as np '''1 导入数据模块''' # posNegDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData' # posNegPath=posNegDir+'/posNegLabelData.xls' # 标记数据所在路径保存在D:/ReviewHelpfulnessPrediction/LabelDataPath.txt文件中 posNegPath = tp.get_txt_data( 'D:/ReviewHelpfulnessPrediction/LabelDataPath.txt', 'line') print posNegPath pos_review = tp.seg_fil_senti_excel( posNegPath, 1, 1, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt' ) neg_review = tp.seg_fil_senti_excel( posNegPath, 2, 1, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt' ) print 'postive review num is:', len(pos_review), 'negtive review num is:', len( neg_review) shuffle(pos_review) shuffle(neg_review) pos = pos_review neg = neg_review """ # Cut positive review to make it the same number of nagtive review (optional)
def create_words_bigrams_scores():
    """Score single words and top bigrams of the review sets by chi-square.

    ``pos_review.xlsx`` and ``neg_review.xlsx`` (sheet 1 / column 1) are
    read from the hard-coded ``SenimentReviewSet`` directory with
    ``tp.seg_fil_senti_excel``, which applies word segmentation and the
    sentiment stop-word list.  For each polarity the segmented reviews are
    flattened into one token list and the 5000 best bigrams by chi-square
    (5000 is a hand-picked threshold) are appended as extra features.

    Returns:
        dict: feature (word or bigram tuple) -> informativeness, defined
        as the feature's chi-square statistic against the positive
        features plus its chi-square statistic against the negative
        features.  Other association measures (e.g. mutual information)
        could be substituted for chi-square.
    """
    review_dir = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature\SenimentReviewSet'
    stopword_file = 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
    chi_sq = BigramAssocMeasures.chi_sq

    def load_features(workbook_name):
        # Flattened tokens of one workbook plus its 5000 best bigrams.
        segmented = tp.seg_fil_senti_excel(review_dir + workbook_name, 1, 1,
                                           stopword_file)
        tokens = list(itertools.chain(*segmented))
        finder = BigramCollocationFinder.from_words(tokens)
        return tokens + finder.nbest(chi_sq, 5000)

    pos = load_features('/pos_review.xlsx')
    neg = load_features('/neg_review.xlsx')

    # Feature selection: overall frequencies and per-polarity frequencies.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, features in (('pos', pos), ('neg', neg)):
        for feature in features:
            word_fd[feature] += 1
            cond_word_fd[label][feature] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for feature, freq in word_fd.iteritems():
        pos_score = chi_sq(cond_word_fd['pos'][feature],
                           (freq, pos_word_count), total_word_count)
        neg_score = chi_sq(cond_word_fd['neg'][feature],
                           (freq, neg_word_count), total_word_count)
        word_scores[feature] = pos_score + neg_score
    return word_scores
import numpy import scipy from random import shuffle import nltk from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures from nltk.probability import FreqDist, ConditionalFreqDist #import sklearn # 1. Load data reviewDataSetPath = 'D:/ReviewHelpfulnessPrediction\ReviewSet/HTC_Z710t_review_2013.6.5.xlsx' review = tp.get_excel_data(reviewDataSetPath, 1, 4, "data") sentiment_review = tp.seg_fil_senti_excel( reviewDataSetPath, 1, 4, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt' ) # 2. Feature extraction method # Used for transform review to features, so it can calculate sentiment probability by classifier # 计算整个语料里面每个词和双词搭配的信息量 # 以单个词语和出现频率为前5000双词作为特征 # return : # 返回每个词以及得分 ''' return : 第五 1.64131573422 当是 4.8096346704 (u'\u624b\u52a8', u'\u5bfc\u5165') 0.831674969506 (u'\u4e4b\u8bcd', u'\u55b7') 0.831674969506 test code:
'''
Note:
If the training (labelled) data changes, update the matching part of
create_word_bigram_scores() in the feature-extraction module (the function
behind the positive/negative-probability features): its hard-coded
label-data directory and its two tp.seg_fil_senti_excel calls that read
sheets 1 and 2 of posNegLabelData.xls with the sentiment stop-word list.
'''

# Rough outline of classifier training:
#   1. Load the labelled data and preprocess it (word segmentation and
#      stop-word removal).
#   2. Extract features (degree-word / part-of-speech counts, sentence and
#      word counts, dictionary-based sentiment scores, positive/negative
#      probability features).
#   3. Train the classifier.

# --- Data loading ---
posNegDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData'
_labelWorkbookPath = posNegDir + '/posNegLabelData.xls'
_stopwordPath = 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'

# Segmented, stop-word-filtered reviews: sheet 1 is positive, sheet 2 negative.
posdata = tp.seg_fil_senti_excel(_labelWorkbookPath, 1, 1, _stopwordPath)
negdata = tp.seg_fil_senti_excel(_labelWorkbookPath, 2, 1, _stopwordPath)

# The same cells as raw, unsegmented data.
posRawData = tp.get_excel_data(_labelWorkbookPath, 1, 1, 'data')
negRawData = tp.get_excel_data(_labelWorkbookPath, 2, 1, 'data')

# --- Functions of the feature-extraction module ---
# a. Extract adjective / adverb / verb count features.
#    Returns a list of [adjNum, advNum, vNum] entries; the parameter
#    rawData is the list of raw (unsegmented) data.
#    On bullet-comment (danmaku) data throughput is roughly 1000 items per
#    second; part-of-speech tagging (tp.postagger(review, 'list')) is the
#    slow part and may be worth optimising.