예제 #1
0
def create_word_scores():
    """Score every single word of the labelled review sets.

    The positive and negative review workbooks are loaded through
    ``tp.seg_fil_senti_excel`` (presumably one token sequence per review --
    the helper is defined outside this file), flattened into two word lists
    and counted overall and per polarity.

    Returns:
        dict mapping each word to the sum of its chi-square score against
        the positive counts and its chi-square score against the negative
        counts.
    """
    stopword_path = 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
    review_dir = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature\SenimentReviewSet'
    pos_reviews = tp.seg_fil_senti_excel(review_dir + '/pos_review.xlsx', 1, 1, stopword_path)
    neg_reviews = tp.seg_fil_senti_excel(review_dir + '/neg_review.xlsx', 1, 1, stopword_path)

    # Flatten the per-review sequences into one flat word list per polarity.
    words_by_label = (
        ('pos', list(itertools.chain.from_iterable(pos_reviews))),
        ('neg', list(itertools.chain.from_iterable(neg_reviews))),
    )

    overall_fd = FreqDist()
    per_label_fd = ConditionalFreqDist()
    for label, words in words_by_label:
        label_fd = per_label_fd[label]
        for token in words:
            overall_fd[token] += 1
            label_fd[token] += 1

    pos_total = per_label_fd['pos'].N()
    neg_total = per_label_fd['neg'].N()
    grand_total = pos_total + neg_total

    chi_sq = BigramAssocMeasures.chi_sq
    scores = {}
    for token, count in overall_fd.iteritems():
        from_pos = chi_sq(per_label_fd['pos'][token], (count, pos_total), grand_total)
        from_neg = chi_sq(per_label_fd['neg'][token], (count, neg_total), grand_total)
        scores[token] = from_pos + from_neg
    return scores
def predDataSentPro(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,sheetNum,colNum,desDir):
    """Classify the reviews of one Excel sheet and dump tags and probabilities.

    The workbook ``reviewDataSetDir/reviewDataSetName + reviewDataSetFileType``
    is preprocessed by ``tp.seg_fil_senti_excel``, converted to features with
    the module-level ``extract_features`` / ``best_words`` and classified by
    the pickled classifier selected through the module-level
    ``best_classifier`` (none of these are defined in this function).

    Two text files are written into ``desDir``:

    * ``<reviewDataSetName>ClassTag.txt``: one predicted label per line.
    * ``<reviewDataSetName>ClassPro.txt``: one line per review holding the
      'pos' and the 'neg' probability separated by a tab.

    Args:
        reviewDataSetDir: directory containing the review workbook.
        reviewDataSetName: workbook file name without extension.
        reviewDataSetFileType: extension appended verbatim to the name
            (so it presumably includes the dot -- TODO confirm with callers).
        sheetNum: sheet selector forwarded to ``tp.seg_fil_senti_excel``.
        colNum: column selector forwarded to ``tp.seg_fil_senti_excel``.
        desDir: directory that receives the two result files.

    Returns:
        Tuple ``(reviewCount, elapsed)``: the number of probability rows
        written and the difference between the two ``time.clock()`` readings
        taken at the start and at the end of the run.
    """
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    preResStorePath = desDir + '/' + reviewDataSetName + 'ClassPro.txt'
    preTagStorePath = desDir + '/' + reviewDataSetName + 'ClassTag.txt'
    start = time.clock()
    sentiment_review = tp.seg_fil_senti_excel(reviewDataSetPath, sheetNum, colNum, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    # Extract the features of the data to classify.
    review_feature = extract_features(sentiment_review, best_words)
    # The pickle file is named after the first 15 characters of str(best_classifier).
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # Load the classifier; the with-block closes the handle, which the
    # previous pickle.load(open(...)) never did.
    # NOTE(review): pickle executes arbitrary code on load -- only use
    # trusted .pkl files.  The file is still opened in text mode exactly as
    # before; switch to 'rb' only after confirming how it was dumped.
    with open(classifierPath) as clf_file:
        clf = pickle.load(clf_file)
    # Predicted class label of every review.
    data_tag = clf.batch_classify(review_feature)
    with open(preTagStorePath, 'w') as tag_file:
        for tag in data_tag:
            tag_file.write(str(tag) + '\n')
    # Positive / negative probability of every review.
    pred = clf.batch_prob_classify(review_feature)
    reviewCount = 0
    with open(preResStorePath, 'w') as pro_file:
        for reviewCount, dist in enumerate(pred, 1):
            pro_file.write(str(dist.prob('pos')) + '\t' + str(dist.prob('neg')) + '\n')
    end = time.clock()
    return reviewCount, end - start
def predictDataSentimentPro(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,sheetNum,colNum,desDir):
    """Classify the reviews of one Excel sheet and dump results plus inputs.

    The workbook ``reviewDataSetDir/reviewDataSetName + reviewDataSetFileType``
    is read twice: raw through ``tp.get_excel_data`` and preprocessed through
    ``tp.seg_fil_senti_excel``.  Features come from the module-level
    ``extract_features`` / ``best_words`` and the classifier is unpickled
    from a file named after the module-level ``best_classifier``.

    Four text files are written into ``desDir``, all prefixed with
    ``reviewDataSetName``:

    * ``ClassTag.txt``: one predicted label per line.
    * ``ClassPro.txt``: 'pos' and 'neg' probability per line, tab separated.
    * ``OriData.txt``: the raw review text, UTF-8 encoded, one per line.
    * ``OriFea.txt``: the feature names of each review on one line, every
      name followed by a tab.  Tuple features have their parts joined with
      '_'.  (Previously a tuple was written as ``a_b_`` without the tab and
      therefore ran into the next feature name.)

    Args:
        reviewDataSetDir: directory containing the review workbook.
        reviewDataSetName: workbook file name without extension.
        reviewDataSetFileType: extension appended verbatim to the name
            (so it presumably includes the dot -- TODO confirm with callers).
        sheetNum: sheet selector forwarded to the ``tp`` helpers.
        colNum: column selector forwarded to the ``tp`` helpers.
        desDir: directory that receives the result files.

    Returns:
        Tuple ``(reviewCount, elapsed)``: the number of probability rows
        written and the difference between the two ``time.clock()`` readings
        taken at the start and at the end of the run.
    """
    reviewDataSetPath = reviewDataSetDir + '/' + reviewDataSetName + reviewDataSetFileType
    oriDataPath = desDir + '/' + reviewDataSetName + 'OriData.txt'
    oriDataFeaPath = desDir + '/' + reviewDataSetName + 'OriFea.txt'
    preResStorePath = desDir + '/' + reviewDataSetName + 'ClassPro.txt'
    preTagStorePath = desDir + '/' + reviewDataSetName + 'ClassTag.txt'
    start = time.clock()
    # Raw data to classify.
    review = tp.get_excel_data(reviewDataSetPath, sheetNum, colNum, "data")
    # Same data after word segmentation and stop-word removal.
    sentiment_review = tp.seg_fil_senti_excel(reviewDataSetPath, sheetNum, colNum, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    # Extract the features of the data to classify.
    review_feature = extract_features(sentiment_review, best_words)
    # The pickle file is named after the first 15 characters of str(best_classifier).
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    # Load the classifier; the with-block closes the handle, which the
    # previous pickle.load(open(...)) never did.
    # NOTE(review): pickle executes arbitrary code on load -- only use
    # trusted .pkl files.  The file is still opened in text mode exactly as
    # before; switch to 'rb' only after confirming how it was dumped.
    with open(classifierPath) as clf_file:
        clf = pickle.load(clf_file)
    # Predicted class label of every review.
    data_tag = clf.batch_classify(review_feature)
    with open(preTagStorePath, 'w') as tag_file:
        for tag in data_tag:
            tag_file.write(str(tag) + '\n')
    # Positive / negative probability of every review.
    pred = clf.batch_prob_classify(review_feature)
    reviewCount = 0
    with open(preResStorePath, 'w') as pro_file:
        for reviewCount, dist in enumerate(pred, 1):
            pro_file.write(str(dist.prob('pos')) + '\t' + str(dist.prob('neg')) + '\n')
    # Record the raw data.
    with open(oriDataPath, 'w') as data_file:
        for d in review:
            data_file.write(d.encode('utf-8') + '\n')
    # Record the extracted features of the raw data.
    with open(oriDataFeaPath, 'w') as fea_file:
        for d in review_feature:
            for w in d:
                # A feature is either a single word or a tuple of words.
                name = '_'.join(w) if isinstance(w, tuple) else w
                fea_file.write(name.encode('utf-8') + '\t')
            fea_file.write('\n')
    end = time.clock()
    return reviewCount, end - start
def create_word_bigram_scores():
    """Score every word and top-ranked bigram of the labelled review workbook.

    Positive reviews are read with sheet argument 1 and negative reviews
    with sheet argument 2 of ``posNegLabelData.xls`` through
    ``tp.seg_fil_senti_excel``.  For each polarity the flat word list is
    extended with the 5000 best bigrams ranked by chi-square, and the
    combined features are counted overall and per polarity.

    Returns:
        dict mapping each feature (a word or a bigram tuple) to the sum of
        its chi-square score against the positive counts and against the
        negative counts.
    """
    label_dir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData'
    workbook_path = label_dir + '/posNegLabelData.xls'
    stopword_path = 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
    pos_reviews = tp.seg_fil_senti_excel(workbook_path, 1, 1, stopword_path)
    neg_reviews = tp.seg_fil_senti_excel(workbook_path, 2, 1, stopword_path)

    def words_plus_bigrams(reviews):
        # Flatten the reviews, then append the 5000 best chi-square bigrams.
        words = list(itertools.chain.from_iterable(reviews))
        finder = BigramCollocationFinder.from_words(words)
        return words + finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    features_by_label = (
        ('pos', words_plus_bigrams(pos_reviews)),
        ('neg', words_plus_bigrams(neg_reviews)),
    )

    overall_fd = FreqDist()
    per_label_fd = ConditionalFreqDist()
    for label, features in features_by_label:
        label_fd = per_label_fd[label]
        for feature in features:
            overall_fd[feature] += 1
            label_fd[feature] += 1

    pos_total = per_label_fd['pos'].N()
    neg_total = per_label_fd['neg'].N()
    grand_total = pos_total + neg_total

    chi_sq = BigramAssocMeasures.chi_sq
    scores = {}
    for feature, count in overall_fd.iteritems():
        from_pos = chi_sq(per_label_fd['pos'][feature], (count, pos_total), grand_total)
        from_neg = chi_sq(per_label_fd['neg'][feature], (count, neg_total), grand_total)
        scores[feature] = from_pos + from_neg
    return scores
def predictExcelDataSentTagProToExcel(reviewDataSetDir,reviewDataSetName,reviewDataSetFileType,sheetNum,colNum,desDir):
    reviewDataSetPath=reviewDataSetDir+'/'+reviewDataSetName+reviewDataSetFileType
    preDataResPath=desDir+'/'+reviewDataSetName+'RawDataTagProFea.xls'
    start=time.clock()
    review = tp.get_excel_data(reviewDataSetPath, sheetNum, colNum, "data")# 读取待分类数据
    #将待分类数据进行分词以及去停用词处理
    sentiment_review = tp.seg_fil_senti_excel(reviewDataSetPath, sheetNum, colNum, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    #提取待分类数据特征
    review_feature = extract_features(sentiment_review, best_words)
    #classifierPath = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature/sentiment_classifier.pkl'
    classifierPath = 'D:/ReviewHelpfulnessPrediction\BuildedClassifier/' + str(best_classifier)[0:15] + '.pkl'
    #装载分类器
    clf = pickle.load(open(classifierPath))
    dataItemCount=len(sentiment_review)
    #分类之预测数据类标签
    data_tag=clf.batch_classify(review_feature)
    #分类之预测数据积极、消极可能性
    res_pro = clf.batch_prob_classify(review_feature)
    # 记录分类结果 积极可能性 消极可能性
    # 记录原始数据
    # 记录原始数据特征提取结果
    # for d in review_feature:
    #     for w,b,in d.iteritems():
    #         p_file.write(w.encode('utf-8') + ' '+str(b)+'\t')
    #     p_file.write('\n')
    # p_file.close()
    preResFile=xlwt.Workbook(encoding='utf-8')
    preResSheet=preResFile.add_sheet('RawDataTagProFea')
    posProbility=[]
    for rowPos in range(dataItemCount):
        preResSheet.write(rowPos,0,review[rowPos])#原始数据
        preResSheet.write(rowPos,1,data_tag[rowPos])#类标签
        preResSheet.write(rowPos,2,str(res_pro[rowPos].prob('pos')))#积极概率
        posProbility.append(res_pro[rowPos].prob('pos'))
        preResSheet.write(rowPos, 3, str(res_pro[rowPos].prob('neg')))#消极概率
        feature=''
        #feature='_'.join(review_feature[rowPos].keys())
       # print type(review_feature[rowPos].keys()),
        # 特征里面可能出现二元词的情况
        for x in review_feature[rowPos].keys():
            if type(x) is not nltk.types.TupleType:
                feature+=x
            else:
                feature+='_'.join(x)
            feature+=' '
        preResSheet.write(rowPos, 4, feature)#特征
    preResFile.save(preDataResPath)
    end=time.clock()
    print 'handle sentences num:', dataItemCount, ' classify time:', end-start
    return posProbility,preDataResPath,review
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn import cross_validation
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
'''1 导入数据模块'''

# Former hard-coded location of the labelled data (kept for reference):
# posNegDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData'
# posNegPath=posNegDir+'/posNegLabelData.xls'
# The path of the labelled data is stored in the file D:/ReviewHelpfulnessPrediction/LabelDataPath.txt
posNegPath = tp.get_txt_data(
    'D:/ReviewHelpfulnessPrediction/LabelDataPath.txt', 'line')
print posNegPath
# Positive reviews are loaded with sheet argument 1, negative reviews with
# sheet argument 2 (column argument 1 in both cases); the last argument is
# the stop-word file handed to the preprocessing helper.
pos_review = tp.seg_fil_senti_excel(
    posNegPath, 1, 1,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)
neg_review = tp.seg_fil_senti_excel(
    posNegPath, 2, 1,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)
print 'postive review num is:', len(pos_review), 'negtive review num is:', len(
    neg_review)

# Reorder both lists in place (presumably random.shuffle -- the import is
# not visible in this part of the file).
shuffle(pos_review)
shuffle(neg_review)

# Short aliases for the two shuffled review lists.
pos = pos_review
neg = neg_review
"""
# Cut positive review to make it the same number of nagtive review (optional)
예제 #7
0
def create_words_bigrams_scores():
    """Score every word and top-ranked bigram of the pos/neg review workbooks.

    ``tp.seg_fil_senti_excel`` loads the positive and the negative review
    workbook (word-segmented, stop words removed).  For each polarity the
    per-review token sequences are flattened into one word list, the 5000
    best bigrams by chi-square are appended (5000 is a hand-picked
    threshold), and all features are counted overall and per polarity.

    Returns:
        dict mapping each feature (a word or a bigram tuple) to its
        informativeness: the chi-square score against the positive counts
        plus the chi-square score against the negative counts.  Another
        association measure (e.g. mutual information) could be substituted.
    """
    data_dir = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature\SenimentReviewSet'
    stopword_path = 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
    reviews_by_label = (
        ('pos', tp.seg_fil_senti_excel(data_dir + '/pos_review.xlsx', 1, 1, stopword_path)),
        ('neg', tp.seg_fil_senti_excel(data_dir + '/neg_review.xlsx', 1, 1, stopword_path)),
    )

    # Flatten the nested review lists into one word list per polarity.
    tokens_by_label = [
        (label, list(itertools.chain.from_iterable(reviews)))
        for label, reviews in reviews_by_label
    ]

    # Features of a polarity = all of its words + its 5000 best bigrams.
    features_by_label = []
    for label, tokens in tokens_by_label:
        finder = BigramCollocationFinder.from_words(tokens)
        best_bigrams = finder.nbest(BigramAssocMeasures.chi_sq, 5000)
        features_by_label.append((label, tokens + best_bigrams))

    # Feature selection: overall frequencies and per-polarity frequencies.
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, features in features_by_label:
        label_fd = cond_word_fd[label]
        for feature in features:
            word_fd[feature] += 1
            label_fd[feature] += 1

    pos_fd = cond_word_fd['pos']
    neg_fd = cond_word_fd['neg']
    pos_total = pos_fd.N()
    neg_total = neg_fd.N()
    grand_total = pos_total + neg_total

    chi_sq = BigramAssocMeasures.chi_sq
    return dict(
        (
            feature,
            chi_sq(pos_fd[feature], (count, pos_total), grand_total)
            + chi_sq(neg_fd[feature], (count, neg_total), grand_total),
        )
        for feature, count in word_fd.iteritems()
    )
예제 #8
0
import numpy
import scipy
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

#import sklearn

# 1. Load data
# Both helpers receive the same workbook with sheet argument 1 and column
# argument 4: get_excel_data returns the raw cells, seg_fil_senti_excel the
# preprocessed version (it additionally takes the stop-word file).
reviewDataSetPath = 'D:/ReviewHelpfulnessPrediction\ReviewSet/HTC_Z710t_review_2013.6.5.xlsx'
review = tp.get_excel_data(reviewDataSetPath, 1, 4, "data")
sentiment_review = tp.seg_fil_senti_excel(
    reviewDataSetPath, 1, 4,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)

# 2. Feature extraction method
# Used to transform reviews into features, so that the classifier can calculate their sentiment probability
# 计算整个语料里面每个词和双词搭配的信息量
# 以单个词语和出现频率为前5000双词作为特征
# return :
# 返回每个词以及得分
'''
return :
第五 1.64131573422
当是 4.8096346704
(u'\u624b\u52a8', u'\u5bfc\u5165') 0.831674969506
(u'\u4e4b\u8bcd', u'\u55b7') 0.831674969506
test code:
'''
                                       注意事项
如果训练数据(标记数据)发生更改,需要修改特征提取模块下“d 提取积极消极可能性特征”部分的 create_word_bigram_scores() 函数里面的以下部分:
posNegDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData'
posdata = tp.seg_fil_senti_excel(posNegDir + '/posNegLabelData.xls', 1, 1,'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
negdata = tp.seg_fil_senti_excel(posNegDir + '/posNegLabelData.xls', 2, 1,'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
'''
'''  训练分类器大致过程如下:'''
'''1  装载标记数据,数据预处理(分词及去停用词)'''
'''2  提取特征(程度词性个数特征、句子个数及词语数量特征、基于词典的情感得分特征、积极消极可能性特征)'''
'''3  训练分类器 '''
'''装载数据模块'''

# Directory holding the labelled workbook posNegLabelData.xls.
posNegDir = 'D:/ReviewHelpfulnessPrediction\LabelReviewData'
# Preprocessed reviews: sheet argument 1 is used for the positive data and
# sheet argument 2 for the negative data (column argument 1 in both cases);
# the last argument is the stop-word file.
posdata = tp.seg_fil_senti_excel(
    posNegDir + '/posNegLabelData.xls', 1, 1,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)
negdata = tp.seg_fil_senti_excel(
    posNegDir + '/posNegLabelData.xls', 2, 1,
    'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt'
)
# The same two sheets read again through get_excel_data, presumably as raw
# text without word segmentation (verify against tp.get_excel_data).
posRawData = tp.get_excel_data(posNegDir + '/posNegLabelData.xls', 1, 1,
                               'data')
negRawData = tp.get_excel_data(posNegDir + '/posNegLabelData.xls', 2, 1,
                               'data')
'''特征提取模块的函数'''
'''a 提取形容词、副词、动词数量特征'''
'''返回 形容词 副词 动词 特征列表[[adjNum,advNum,vNum],[],],其中参数rawData为原始数据列表(未经分词处理)'''
'''在处理弹幕数据时,时间性能大致1s可以处理1000条数据(词性标注比较耗时 看看可否优化(tp.postagger(review, 'list')))'''