def unionDict(dict1Path,dict2Path): dict1=tp.get_txt_data(dict1Path,'lines') dict2=tp.get_txt_data(dict2Path,'lines') dict={} for x in dict1: dict.setdefault(x,1) for x in dict2: if dict.has_key(x)==False: dict.setdefault(x,1) else: dict[x]+=1 print len(dict1),len(dict2) print len(dict) return dict
def testLabelDataAcc(): begin = time.clock() '''获得原始数据路径''' # reviewDataSetDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData' # reviewDataSetName = 'posNegLabelData' # reviewDataSetFileType = '.xls' #dataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType dataSetPath = tp.get_txt_data( 'D:/ReviewHelpfulnessPrediction/LabelDataPath.txt', 'line') reviewDataSetDir, reviewDataSetName, reviewDataSetFileType = parseFilePath( dataSetPath) '''获得目标数据路径''' dstSavePath = reviewDataSetDir + '/' + reviewDataSetName + 'BasedDictSentimentScore.txt' '''获得原始数据''' posreview = tp.get_excel_data(dataSetPath, 1, 1, "data") negreview = tp.get_excel_data(dataSetPath, 2, 1, "data") review = posreview + negreview '''得到每句评论[[PosSum, NegSum],[],]''' sentiment_score_list = get_review_set_sentiement_score(review) '''得到每句评论的整体得分''' sentiment_overall_score = get_sentiment_overall_score_to_txt( sentiment_score_list, review, dstSavePath) labelClass = [] for pos in range(len(posreview)): labelClass.append(1) for pos in range(len(negreview)): labelClass.append(0) # for pos in range(len(sentiment_overall_score)): # print sentiment_score_list[pos],sentiment_overall_score[pos],labelClass[pos] finalAcc = getAccuracy(sentiment_overall_score, labelClass) print 'sentiment Analyze Based Dictionary Accuracy:', finalAcc, 'data item num:', len( review) return finalAcc, len(review)
def remove_duplicate_comment(srcpath, para, excelpath): begin = time.clock() raw_data = tp.get_txt_data(srcpath, para) review_diff_set = {} pre_count = len(raw_data) cur_count = 0 for x in raw_data: if review_diff_set.has_key(x) == False: review_diff_set[x] = 1 cur_count += 1 else: review_diff_set[x] += 1 excel_file = xlwt.Workbook(encoding='utf-8') sheet_name = 'label_data' sheet_pos = 1 excel_sheet = excel_file.add_sheet(sheet_name + str(sheet_pos)) row_pos = 0 excel_sheet.write(row_pos, 0, 'review_data') excel_sheet.write(row_pos, 1, 'review_count') row_pos += 1 for w, c in review_diff_set.iteritems(): if row_pos == 65536: sheet_pos += 1 excel_sheet = excel_file.add_sheet(sheet_name + str(sheet_pos)) row_pos = 0 excel_sheet.write(row_pos, 0, 'review_data') excel_sheet.write(row_pos, 1, 'review_count') row_pos += 1 excel_sheet.write(row_pos, 0, w) excel_sheet.write(row_pos, 1, str(c)) row_pos += 1 excel_file.save(excelpath) end = time.clock() print 'remove same reviews time:', end - begin, 'handle review num:', pre_count, 'different review num:', cur_count return pre_count, cur_count
def change_txt_to_excel(srcpath,para,excelpath): begin = time.clock() raw_data = tp.get_txt_data(srcpath, para) excel_file = xlwt.Workbook(encoding='utf-8') sheet_name = 'label_data' sheet_pos = 1 excel_sheet = excel_file.add_sheet(sheet_name + str(sheet_pos)) row_pos = 0 excel_sheet.write(row_pos, 0, 'review_data') excel_sheet.write(row_pos, 1, 'review_count') excel_sheet.write(row_pos, 2, 'is_subjective') excel_sheet.write(row_pos, 3, 'sentiment_tendency') excel_sheet.write(row_pos, 4, 'is_erotic') excel_sheet.write(row_pos, 5, 'key_words') row_pos += 1 for w in raw_data: if row_pos == 65536: sheet_pos += 1 excel_sheet = excel_file.add_sheet(sheet_name + str(sheet_pos)) row_pos = 0 excel_sheet.write(row_pos, 0, 'review_data') excel_sheet.write(row_pos, 1, 'review_count') row_pos += 1 excel_sheet.write(row_pos, 0, w) excel_sheet.write(row_pos, 1, str(1)) row_pos += 1 excel_file.save(excelpath) end = time.clock() print 'remove same reviews time:', end - begin, 'handle review num:', len(raw_data)
def read_txt_review_set_and_store_score(dataSetDir, dataSetName, dataSetFileType, dstDir):
    """Score a review text file with the sentiment dictionary and persist it.

    :param dataSetDir: directory holding the review file
    :param dataSetName: file name without extension
    :param dataSetFileType: file extension (e.g. '.txt')
    :param dstDir: directory for the '<name>SentiDictFea.txt' output
    :return: (result of store_sentiment_dictionary_score, elapsed seconds)
    """
    start = time.clock()
    srcPath = dataSetDir + '/' + dataSetName + dataSetFileType
    outPath = dstDir + '/' + dataSetName + 'SentiDictFea.txt'
    reviews = tp.get_txt_data(srcPath, "lines")
    res = store_sentiment_dictionary_score(reviews, outPath)
    end = time.clock()
    return res, end - start
def filt_objective_sentence(srcpath,para,dstpath): begin=time.clock() raw_data=tp.get_txt_data(srcpath,para) f = open(dstpath, 'w') count=0 for x in raw_data: if is_single_review_sentiment(x)==True: f.write(x.encode('utf-8') + '\n') count+=1 f.close() end=time.clock() print 'filt objective reviews time:',end-begin,'handle review num:',len(raw_data),'subjective review num:',count return count
def sentiAnalyzeBaseDict(reviewDataSetName, reviewDataSetFileType, windowSize, posBounder, negBounder, sentScoreBounder, timeInterval=20): begin = time.clock() '''获得原始数据路径''' reviewDataSetDir = 'D:/ReviewHelpfulnessPrediction\BulletData' saveResPath = 'D:/ReviewHelpfulnessPrediction/PredictClassRes' dataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType figDir = 'D:/ReviewHelpfulnessPrediction\SentimentLineFig' '''获得目标数据路径''' dstSavePath = saveResPath + '/' + reviewDataSetName + 'BasedDictSentimentScore.txt' '''获得原始数据''' review = tp.get_txt_data(dataSetPath, "lines") '''得到每句评论[[PosSum, NegSum],[],]''' sentiment_score_list = get_review_set_sentiement_score(review) '''得到每句评论的整体得分''' sentiment_overall_score = get_sentiment_overall_score_to_txt( sentiment_score_list, review, dstSavePath) '''分析评论情感得分数据 按照窗口迭代 获得 情感值 积极比率 消极比率 异常话语位置''' # posBounder=0.6 # negBounder=0.4 sentimentValueList, posRatioList, negRatioList, strangeWordPos = analyzeSentimentProList( sentiment_overall_score, windowSize, posBounder, negBounder, sentScoreBounder) '''合并重叠区间''' finalStrangeWordPos = unionStrangeWordPos(strangeWordPos) '''获得平均情感值''' meanSentPosPro = getMeanSentimentValue(sentiment_overall_score) print 'mean sentiment postive probility', meanSentPosPro overallPosRatio = getOverallPosRatio(sentiment_overall_score, posBounder) overallNegRatio = getOverallNegRatio(sentiment_overall_score, negBounder) '''输出异常话语位置''' outputStrangeWordPosInTxt(finalStrangeWordPos, dstSavePath) '''绘制情感曲线图''' drawSentimentLine(sentimentValueList, figDir + '/' + reviewDataSetName + 'SentCurveDA.png') drawPosNegRatioPie(overallPosRatio, overallNegRatio, figDir + '/' + reviewDataSetName + 'PosNegRatioDA.png') '''输出异常话语''' outputStrangeWords(finalStrangeWordPos, review) '''绘制情感波动动态图''' #drawSentimentChangeLine(sentimentValueList, timeInterval, windowSize, -30, 30) end = time.clock() print 'sentiment Analyze based dict running time:', end - begin, 'handle review num:', len( review)
def extractFeaPreUnlabelTxtData(rawDataPath,preResPath): begin=time.clock() '''获取原始数据列表''' unlabedRawData = tp.get_txt_data(rawDataPath, 'lines') '''获取经分词及去停用词处理后的数据列表''' unlabedSegFiltData = tp.seg_fil_txt(rawDataPath,'lines') '''提取数据特征''' dataAllFea = extractAllFea(unlabedRawData, unlabedSegFiltData) '''读取最佳分类器(最佳分类器名字位于D:/ReviewHelpfulnessPrediction\BuildedClassifier/bestClassifierAcc.txt里面)''' bestClassifier = read_best_classifier() print bestClassifier '''装载分类器,预测分类结果''' loadClassifierPreRes(bestClassifier, unlabedRawData, dataAllFea, preResPath) end=time.clock() print 'extract feature and predict data time is:',end-begin,'handle data item num is:',len(unlabedRawData)
def get_all_trainset(dimension):
    """Build the full (pos + neg) training set over the best word features.

    Extends the automatically selected best words with hand-labelled key
    words, extracts features for the positive and negative corpora,
    shuffles each class independently and concatenates them.

    :param dimension: number of best words to select
    :return: list of training items, positive class first
    """
    best_words = find_best_words(dimension)
    # Merge in the manually curated key words.
    select_key_words = tp.get_txt_data(
        'D:/ReviewHelpfulnessPrediction\KeyWords/PosNegKeyWords.txt', 'lines')
    for word in select_key_words:
        best_words.add(word)
    posFeatures = pos_features(best_word_features_com, best_words)
    negFeatures = neg_features(best_word_features_com, best_words)
    # Randomise order within each class before combining.
    shuffle(posFeatures)
    shuffle(negFeatures)
    return posFeatures + negFeatures
def predictTxtDataSentTagProToExcel(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,desDir):
    """Classify a text data set and dump tag/probability/features to Excel.

    Loads the stored best classifier, predicts a class tag plus pos/neg
    probabilities for every line of the input file, and writes one row
    per item (raw text, tag, P(pos), P(neg), feature string) to
    '<name>RawDataTagProFea.xls', rolling to a new sheet at 65536 rows.

    :param reviewDataSetDir: directory of the input file
    :param reviewDataSetName: input file name without extension
    :param reviewDataSetFileType: input file extension
    :param desDir: output directory
    :return: (list of P(pos) per item, output .xls path, raw review list)
    """
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    preDataResPath = desDir + '/' + reviewDataSetName + 'RawDataTagProFea.xls'
    start = time.clock()
    review = tp.get_txt_data(reviewDataSetPath, "lines")  # data to classify
    # Segment the data and strip stopwords before feature extraction.
    sentiment_review = tp.seg_fil_txt(reviewDataSetPath,'lines')
    # Extract classification features.
    review_feature = extract_features(sentiment_review, best_words)
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # Load the pickled classifier.
    clf = pickle.load(open(classifierPath))
    dataItemCount = len(sentiment_review)
    # Predict class tags.
    data_tag = clf.batch_classify(review_feature)
    # Predict positive/negative probabilities.
    res_pro = clf.batch_prob_classify(review_feature)
    preResFile = xlwt.Workbook(encoding='utf-8')
    sheetName='RawDataTagProFea'
    sheetPos=0
    preResSheet = preResFile.add_sheet(sheetName+str(sheetPos))
    posProbility = []
    excelRowPos=0
    for rowPos in range(dataItemCount):
        if excelRowPos==65536:
            # xlwt sheets cap at 65536 rows; continue on a new sheet.
            sheetPos+=1
            preResSheet=preResFile.add_sheet(sheetName+str(sheetPos))
            excelRowPos=0
        preResSheet.write(excelRowPos, 0, review[rowPos])  # raw data
        preResSheet.write(excelRowPos, 1, data_tag[rowPos])  # class tag
        preResSheet.write(excelRowPos, 2, str(res_pro[rowPos].prob('pos')))  # positive probability
        posProbility.append(res_pro[rowPos].prob('pos'))
        preResSheet.write(excelRowPos, 3, str(res_pro[rowPos].prob('neg')))  # negative probability
        feature = ''
        # Features may contain bigrams (tuples); join those with '_'.
        for x in review_feature[rowPos].keys():
            if type(x) is not nltk.types.TupleType:
                feature += x
            else:
                feature += '_'.join(x)
            feature += ' '
        preResSheet.write(excelRowPos, 4, feature)  # feature string
        excelRowPos+=1
    preResFile.save(preDataResPath)
    end = time.clock()
    print 'handle sentences num:', dataItemCount, ' classify time:', end - start
    return posProbility,preDataResPath,review
def predTxtDataSentPro(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,desDir):
    """Classify a text data set and write tags/probabilities/features to txt.

    Produces four files in *desDir*: the original data
    ('<name>OriData.txt'), its extracted features ('<name>OriFea.txt'),
    the predicted class tags ('<name>ClassTag.txt') and the pos/neg
    probabilities ('<name>ClassPro.txt').

    :param reviewDataSetDir: directory of the input file
    :param reviewDataSetName: input file name without extension
    :param reviewDataSetFileType: input file extension
    :param desDir: output directory
    :return: (number of classified items, elapsed seconds)
    """
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    oriDataPath = desDir + '/' + reviewDataSetName + 'OriData.txt'
    oriDataFeaPath = desDir + '/' + reviewDataSetName + 'OriFea.txt'
    preResStorePath = desDir + '/' + reviewDataSetName + 'ClassPro.txt'
    preTagStorePath = desDir + '/' + reviewDataSetName + 'ClassTag.txt'
    start = time.clock()
    review = tp.get_txt_data(reviewDataSetPath, "lines")  # data to classify
    # Segment the data and strip stopwords before feature extraction.
    sentiment_review = tp.seg_fil_txt(reviewDataSetPath,'lines')
    # Extract classification features.
    review_feature = extract_features(sentiment_review, best_words)
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # Load the pickled classifier.
    clf = pickle.load(open(classifierPath))
    # Predict class tags and persist them, one per line.
    data_tag = clf.batch_classify(review_feature)
    p_file = open(preTagStorePath, 'w')
    for i in data_tag:
        p_file.write(str(i) + '\n')
    p_file.close()
    # Predict positive/negative probabilities.
    pred = clf.batch_prob_classify(review_feature)
    # Record P(pos) and P(neg) per item, tab separated.
    p_file = open(preResStorePath, 'w')
    reviewCount = 0
    for i in pred:
        reviewCount += 1
        p_file.write(str(i.prob('pos')) + '\t' + str(i.prob('neg')) + '\n')
    p_file.close()
    # Record the original data.
    p_file = open(oriDataPath, 'w')
    for d in review:
        p_file.write(d.encode('utf-8') + '\n')
    p_file.close()
    p_file = open(oriDataFeaPath, 'w')
    # Record the extracted feature dict of every original item.
    for d in review_feature:
        for w, b, in d.iteritems():
            p_file.write(w.encode('utf-8') + ' ' + str(b) + '\t')
        p_file.write('\n')
    p_file.close()
    end = time.clock()
    return reviewCount, end - start
def sentiAnalyzeBaseDictUI(reviewDataSetDir, reviewDataSetName, reviewDataSetFileType, windowSize, posBounder, negBounder, sentScoreBounder, timeInterval=20): begin = time.clock() desDir = 'D:/ReviewHelpfulnessPrediction/PredictClassRes' figDir = 'D:/ReviewHelpfulnessPrediction\SentimentLineFig' strangeWordDir = 'D:/ReviewHelpfulnessPrediction\StrangeWords' rawDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType strangeWordPath = strangeWordDir + '/' + reviewDataSetName + 'DA.txt' classifyResPath = desDir + '/' + reviewDataSetName + 'DA.txt' sentimentLinePath = figDir + '/' + reviewDataSetName + 'SCDA.png' posNegRatioPath = figDir + '/' + reviewDataSetName + 'PNRDA.png' review = tp.get_txt_data(rawDataSetPath, "lines") '''得到每句评论[[PosSum, NegSum],[],]''' sentiment_score_list = get_review_set_sentiement_score(review) '''得到每句评论的整体得分''' sentiment_overall_score = get_sentiment_overall_score_to_txt( sentiment_score_list, review, classifyResPath) '''分析评论情感得分数据 按照窗口迭代 获得 情感值 积极比率 消极比率 异常话语位置''' sentimentValueList, posRatioList, negRatioList, strangeWordPos = analyzeSentimentProList( sentiment_overall_score, windowSize, posBounder, negBounder, sentScoreBounder) finalStrangeWordPos = unionStrangeWordPos(strangeWordPos) #meanSentPosPro = getMeanSentimentValue(sentiment_overall_score) overallPosRatio = getOverallPosRatio(sentiment_overall_score, posBounder) overallNegRatio = getOverallNegRatio(sentiment_overall_score, negBounder) drawSentimentLine(sentimentValueList, sentimentLinePath) drawPosNegRatioPie(overallPosRatio, overallNegRatio, posNegRatioPath) saveStrangeWordsToTxt(finalStrangeWordPos, review, strangeWordPath) end = time.clock() print 'sentiment Analyze based dict running time:', end - begin, 'handle review num:', len( review) return strangeWordPath, sentimentLinePath, classifyResPath
def storeTxtReviewSenValue(dataSetDir, dataSetName, dataSetFileType, dstDir):
    """Score reviews with SnowNLP and write one sentiment value per line.

    Empty input lines are skipped.  NOTE(review): another definition of
    this name appears later in SOURCE; if both live in the same module
    the later one shadows this — confirm against the full file.

    :return: (number of scored reviews, elapsed seconds)
    """
    start = time.clock()
    srcPath = dataSetDir + '/' + dataSetName + dataSetFileType
    outPath = dstDir + '/' + dataSetName + 'SnowNLPSentiment.txt'
    reviewSet = tp.get_txt_data(srcPath, 'lines')
    # SnowNLP sentiment score for every non-empty review.
    reviewSentiment = [SnowNLP(r).sentiments for r in reviewSet if r != '']
    reviewNum = 0
    f = open(outPath, 'w')
    for score in reviewSentiment:
        f.write(str(score) + '\n')
        reviewNum += 1
    f.close()
    end = time.clock()
    return reviewNum, end - start
def word_by_word_review(filepath, sheetnum, colnum):
    """Read product reviews from an Excel column, segment and de-stopword.

    Segments every review, removes stopwords and bare spaces, and
    flattens the result into a single one-dimensional word list.

    :param filepath: .xls path holding the review data
    :param sheetnum: sheet index passed to tp.get_excel_data
    :param colnum: column index passed to tp.get_excel_data
    :return: flat list of words across all reviews
    """
    # Segment every review into a word list.
    review_data = []
    for cell in tp.get_excel_data(filepath, sheetnum, colnum, 'data')[0:tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')]:
        review_data.append(tp.segmentation(cell, 'list'))
    # Stopword list used for filtering.
    stopwords = tp.get_txt_data(
        'D:/ReviewHelpfulnessPrediction\PreprocessingModule/stopword.txt',
        'lines')
    # Filter stopwords from every segmented review.  (The original also
    # reset ``fil = []`` after each append — a no-op, now removed.)
    seg_fil_result = [
        [word for word in review if word not in stopwords and word != ' ']
        for review in review_data
    ]
    # Flatten into one one-dimensional word list.
    return list(itertools.chain(*seg_fil_result))
def get_trainset_testset_testtag(dimension):
    """Split pos/neg feature sets 80/20 into train parts and a test set.

    Selects the best *dimension* words, augments them with hand-labelled
    key words, extracts features for both corpora and reserves the last
    20% of each class for testing (no shuffling, so the split is
    deterministic).

    :param dimension: number of best words to select
    :return: (positive train set, negative train set,
              test features, test tags)
    """
    best_words = find_best_words(dimension)
    # Merge in the manually curated key words.
    select_key_words = tp.get_txt_data(
        'D:/ReviewHelpfulnessPrediction\KeyWords/PosNegKeyWords.txt', 'lines')
    for word in select_key_words:
        best_words.add(word)
    posFeatures = pos_features(best_word_features_com, best_words)
    negFeatures = neg_features(best_word_features_com, best_words)
    # 80% of each class goes to training, the remainder to testing.
    train_pos = int(len(pos_review) * 0.8)
    train_neg = int(len(neg_review) * 0.8)
    train_set_pos = posFeatures[:train_pos]
    train_set_neg = negFeatures[:train_neg]
    test_set = posFeatures[train_pos:] + negFeatures[train_neg:]
    # Separate the features from the class tags.
    test_fea, test_tag = zip(*test_set)
    return train_set_pos, train_set_neg, test_fea, test_tag
def storeTxtReviewSenValue(dataSetDir, dataSetName, dataSetFileType, dstDir):
    """Score reviews with SnowNLP; write '<review><TAB><score>' per line.

    Skips empty input lines; review text is UTF-8 encoded on output.

    :return: (number of scored reviews, elapsed seconds)
    """
    start = time.clock()
    srcPath = dataSetDir + '/' + dataSetName + dataSetFileType
    outPath = dstDir + '/' + dataSetName + 'SnowNLPSentiment.txt'
    reviewSet = tp.get_txt_data(srcPath, 'lines')
    # Pair each non-empty review with its SnowNLP sentiment score.
    scored = []
    for review in reviewSet:
        if review == '':
            continue
        scored.append((review, SnowNLP(review).sentiments))
    reviewNum = 0
    f = open(outPath, 'w')
    for text, score in scored:
        f.write(str(text.encode('utf-8')) + '\t' + str(score) + '\n')
        reviewNum += 1
    f.close()
    end = time.clock()
    return reviewNum, end - start
def predictTxtDataSentTagProToTxt(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,desDir):
    """Classify a text data set and dump tags/probabilities/features to txt.

    Loads the stored best classifier, predicts a class tag plus pos/neg
    probabilities for every line of the input file, and writes one
    tab-separated line per item (raw text, tag, P(pos), P(neg),
    space-joined feature string) to '<name>RawDataTagProFea.txt'.

    :param reviewDataSetDir: directory of the input file
    :param reviewDataSetName: input file name without extension
    :param reviewDataSetFileType: input file extension
    :param desDir: output directory
    :return: (list of P(pos) per item, output txt path, raw review list)
    """
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    preDataResPath = desDir + '/' + reviewDataSetName + 'RawDataTagProFea.txt'
    start = time.clock()
    review = tp.get_txt_data(reviewDataSetPath, "lines")  # data to classify
    # Segment the data and strip stopwords before feature extraction.
    sentiment_review = tp.seg_fil_txt(reviewDataSetPath,'lines')
    # Extract classification features.
    review_feature = extract_features(sentiment_review, best_words)
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # Load the pickled classifier.
    clf = pickle.load(open(classifierPath))
    dataItemCount = len(sentiment_review)
    # Predict class tags.
    data_tag = clf.batch_classify(review_feature)
    # Predict positive/negative probabilities.
    res_pro = clf.batch_prob_classify(review_feature)
    preResFile = open(preDataResPath,'w')
    posProbility = []
    for rowPos in range(dataItemCount):
        posProbility.append(res_pro[rowPos].prob('pos'))
        feature = ''
        # Features may contain bigrams (tuples); join those with '_'.
        for x in review_feature[rowPos].keys():
            if type(x) is not nltk.types.TupleType:
                feature += x
            else:
                feature += '_'.join(x)
            feature += ' '
        preResFile.write(
            review[rowPos].encode('utf-8') + '\t' + str(data_tag[rowPos]) + '\t' + str(res_pro[rowPos].prob('pos')) + '\t' + str(
                res_pro[rowPos].prob('neg'))+'\t'+feature.encode('utf-8')+'\n')
    preResFile.close()
    end = time.clock()
    print 'handle sentences num:', dataItemCount, ' classify time:', end - start
    return posProbility,preDataResPath,review
积极消极标记数据 posNegLabelData.xls
主客观标记数据 subObjLabelData.xls
鉴黄标记数据 eroNorLabelData.xls
'''
import textProcessing as tp
import numpy as np
import time
import xlwt
import xlrd
import chardet
import os
'''导入情感词典'''
# Load the positive and negative sentiment word lists.
dictDir='D:/ReviewHelpfulnessPrediction\SentimentDict'
posdict = tp.get_txt_data(dictDir+"/posdict.txt","lines")
negdict = tp.get_txt_data(dictDir+"/negdict.txt","lines")
'''过滤器 过滤掉不含主观情感的语句 客观语句'''
'''构建情感词典 这里只简单地分为积极和消极'''
# Combined vocabulary: any sentence containing one of these words is
# treated as subjective.
sentiment_dict=posdict+negdict
'''判断单条评论是否为具备情感倾向语句 如果评论里有一个词位于情感词典中,则可认为该句具备情感倾向'''
def is_single_review_sentiment(review):
    """Return True if any word of *review* is in the sentiment dictionary.

    The review is first split into sentences, each sentence segmented
    into words; the first dictionary hit short-circuits.  When no word
    matches, the function falls through and implicitly returns None
    (falsy) — at least within the code visible here.
    """
    cuted_review = tp.cut_sentence_2(review)  # split the review into sentences
    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, 'list')  # segment the sentence into words
        for word in seg_sent:
            if word in sentiment_dict:
                return True
·计算文章的情感得分
·考虑到语句中的褒贬并非稳定分布,以上步骤对于积极和消极的情感词分开执行,最终的到两个分值,分别表示文本的正向情感值和负向情感值。
'''
import textProcessing as tp
import numpy as np
import time
import xlwt
import xlrd
from matplotlib import pyplot as plt
from matplotlib import animation
'''1 导入情感词典'''
'''导入情感词典'''
# Load the positive/negative sentiment word lists and time the load.
begin = time.clock()
dictDir = 'D:/ReviewHelpfulnessPrediction\SentimentDict'
posdict = tp.get_txt_data(dictDir + "/posdict.txt", "lines")
negdict = tp.get_txt_data(dictDir + "/negdict.txt", "lines")
'''导入形容词、副词、否定词等程度词字典'''
# Degree-adverb dictionaries (most/very/more/ish/insufficiently) plus
# negation words (inverse) — presumably used to weight or flip a
# sentiment word's value; confirm against the scoring functions.
mostdict = tp.get_txt_data(dictDir + '/most.txt', 'lines')
verydict = tp.get_txt_data(dictDir + '/very.txt', 'lines')
moredict = tp.get_txt_data(dictDir + '/more.txt', 'lines')
ishdict = tp.get_txt_data(dictDir + '/ish.txt', 'lines')
insufficientdict = tp.get_txt_data(dictDir + '/insufficiently.txt', 'lines')
inversedict = tp.get_txt_data(dictDir + '/inverse.txt', 'lines')
end = time.clock()
print 'load dictionary time:', end - begin
'''2 基于字典的情感分析 基本功能'''
'''匹配程度词并设置权重'''
'''parm:word 当前情感词的前面词语 sentiment_value 当前情感词的情感值'''
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn import cross_validation
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
'''1 导入数据模块'''
# The labelled-data workbook path is stored in LabelDataPath.txt.
posNegPath = tp.get_txt_data(
    'D:/ReviewHelpfulnessPrediction/LabelDataPath.txt', 'line')
print posNegPath
# Sheet 1: positive reviews, sheet 2: negative reviews; both are
# segmented and stopword-filtered at import time.
pos_review = tp.seg_fil_senti_excel(
    posNegPath, 1, 1,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)
neg_review = tp.seg_fil_senti_excel(
    posNegPath, 2, 1,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)
print 'postive review num is:', len(pos_review), 'negtive review num is:', len(
    neg_review)
# Randomise each class in place (``shuffle`` is assumed imported in a
# part of the file not visible here — TODO confirm).
shuffle(pos_review)
shuffle(neg_review)
""" ''' 计算一条评论 积极、消极得分,平均得分,标准偏差 模块目标是提取一条评论的 positive/negative score, average score and standard deviation features (all 6 features) 情感分析依赖于情感词典 ''' import textProcessing as tp import numpy as np import time import xlwt import xlrd '''1 导入情感词典以及数据集''' '''导入情感词典''' posdict = tp.get_txt_data( "D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\PositiveAndNegativeDictionary/posdict.txt", "lines") negdict = tp.get_txt_data( "D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\PositiveAndNegativeDictionary/negdict.txt", "lines") '''导入形容词、副词、否定词等程度词字典''' mostdict = tp.get_txt_data( 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/most.txt', 'lines') verydict = tp.get_txt_data( 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/very.txt', 'lines') moredict = tp.get_txt_data( 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\SentimentDictionaryFeatures\SentimentDictionary\AdverbsOfDegreeDictionary/more.txt', 'lines') ishdict = tp.get_txt_data(