예제 #1
0
def seg_filter_txt(filepath, storepath,
                   stopword_path='D:/code/seg_fil_test/stopword.txt'):
    """Segment the first line of a text file, drop stopwords, save the rest.

    Only the first line of ``filepath`` is processed. The surviving words are
    written to ``storepath`` UTF-8 encoded, each followed by a single space.

    Args:
        filepath: Path of the text file to segment.
        storepath: Path of the output file; it is opened in ``'w'`` mode.
        stopword_path: Path of a UTF-8 stopword file, one stopword per line.
            Defaults to the location that used to be hard-coded here.

    Note:
        This targets Python 2 byte strings (``str.decode`` on data read from
        a file opened with ``'r'``, ``unicode.encode`` before writing).
    """
    with open(filepath, 'r') as txtfile:
        txtdata = txtfile.readlines()

    # An empty input file has no first line; treat it as "no words" instead
    # of failing with IndexError.
    # NOTE(review): assumes tp.segmentation(text, 'list') returns a flat list
    # of unicode words, which is how the other callers in this file use it.
    words = tp.segmentation(txtdata[0], 'list') if txtdata else []

    with open(stopword_path, 'r') as stopfile:
        # A set gives O(1) membership tests in the filter below.
        stopwords = set(stopfile.read().decode('utf8').split('\n'))

    # The previous version collected *lists* into the result and then called
    # .encode() on those lists while writing, which raised AttributeError.
    # Filter whole words so that each written item is a single word.
    kept = [word for word in words if word not in stopwords and word != ' ']

    with open(storepath, 'w') as fil_file:
        fil_file.write(''.join(word.encode('utf8') + ' ' for word in kept))
def is_single_review_sentiment(review):
    """Report whether a review contains at least one sentiment word.

    The review is cut into sentences with ``tp.cut_sentence_2``; each
    sentence is segmented with ``tp.segmentation(..., 'list')``. Scanning
    stops at the first token that is found in ``sentiment_dict``.

    Args:
        review: The review text to inspect.

    Returns:
        True if any segmented token is in ``sentiment_dict``, else False.
    """
    sentences = tp.cut_sentence_2(review)
    # The generator is lazy, so later sentences are only segmented when no
    # sentiment word has been found yet.
    return any(
        token in sentiment_dict
        for sentence in sentences
        for token in tp.segmentation(sentence, 'list')
    )
def word_sent_count(dataset):
    """Compute simple length features for every review in ``dataset``.

    Args:
        dataset: Iterable of review texts.

    Returns:
        A list with one ``[word_num, sent_num, sent_word]`` entry per review:
        ``word_num`` is the number of items returned by
        ``tp.segmentation(review, 'list')``, ``sent_num`` is the number of
        items returned by ``tp.cut_sentence_2(review)``, and ``sent_word`` is
        the average number of words per sentence (``0.0`` when the review has
        no sentences).
    """
    counts = []  # distinct name so the local does not shadow the function
    for review in dataset:
        sents = tp.cut_sentence_2(review)  # split into sentences
        words = tp.segmentation(review, 'list')  # split into words
        sent_num = len(sents)
        word_num = len(words)
        # Review length = word number / sentence number. A review may yield
        # no sentences at all (e.g. a blank line), so guard the division
        # instead of raising ZeroDivisionError.
        sent_word = float(word_num) / float(sent_num) if sent_num else 0.0
        counts.append([word_num, sent_num, sent_word])
    return counts
def word_sent_count(rawData):
    """Compute length features for every review and print the elapsed time.

    Args:
        rawData: Sized collection of review texts (``len()`` is called on it
            for the timing message).

    Returns:
        A list with one ``[word_num, sent_num, sent_word]`` entry per review:
        ``word_num`` is the number of items returned by
        ``tp.segmentation(review, 'list')``, ``sent_num`` is the number of
        items returned by ``tp.cut_sentence_2(review)``, and ``sent_word`` is
        the average number of words per sentence (``0.0`` when the review has
        no sentences).
    """
    # time.clock was removed in Python 3.8; use perf_counter when it exists
    # and fall back to time.clock on interpreters that lack it (Python 2).
    timer = getattr(time, 'perf_counter', None) or time.clock

    begin = timer()
    counts = []  # distinct name so the local does not shadow the function
    for review in rawData:
        sents = tp.cut_sentence_2(review)  # split into sentences
        words = tp.segmentation(review, 'list')  # split into words
        sent_num = len(sents)
        word_num = len(words)
        # Review length = word number / sentence number. Guard against a
        # review with no sentences instead of raising ZeroDivisionError.
        sent_word = float(word_num) / float(sent_num) if sent_num else 0.0
        counts.append([word_num, sent_num, sent_word])
    end = timer()

    # Single pre-formatted string: prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('extract word_sent_count feature time is: %s handle data item num is: %s'
          % (end - begin, len(rawData)))
    return counts
예제 #5
0
def sentence_sentiment_score(dataset):
    """Score every sentence of every review for positive/negative sentiment.

    Each review is cut into sentences with ``tp.cut_sentence_2`` and each
    sentence is segmented with ``tp.segmentation(..., 'list')``. For every
    sentence a positive and a negative counter are accumulated:

    * a word in ``posdict`` / ``negdict`` adds 1 to the matching counter, and
      the words between the previous sentiment word and this one are passed
      through ``match(word, count)`` to adjust that counter;
    * an exclamation mark adds 2 to the counter of the last sentiment word
      found when scanning the sentence from its end.

    The pair is then handed to ``transform_to_positive_num`` and the result is
    stored as the sentence score.

    Args:
        dataset: Iterable of review texts.

    Returns:
        A list with one entry per review; each entry is the list of that
        review's sentence scores:
        ``[[s11_score, s12_score, ...], [s21_score, ...], ...]``. A review
        with no sentences gets the single score
        ``transform_to_positive_num(0, 0)``.
    """
    # NOTE(review): the original compared the word against the ASCII '!'
    # twice; the second operand is presumed to have been the full-width
    # exclamation mark (U+FF01) used in Chinese text -- confirm.
    # Unicode literals replace the Python-2-only '!'.decode('utf8').
    exclamations = (u'!', u'\uff01')

    all_review_count = []
    for cell in dataset:
        review = tp.cut_sentence_2(cell)
        single_review_count = []
        if not review:  # blank line: no sentences to score
            single_review_count.append(transform_to_positive_num(0, 0))
        for sent in review:
            seg_sent = tp.segmentation(sent, 'list')
            start = 0  # index just after the previous sentiment word
            poscount = 0
            negcount = 0
            for i, word in enumerate(seg_sent):
                if word in posdict:
                    poscount += 1
                    # Words since the previous sentiment word adjust it.
                    for w in seg_sent[start:i]:
                        poscount = match(w, poscount)
                    start = i + 1
                elif word in negdict:
                    negcount += 1
                    for w in seg_sent[start:i]:
                        negcount = match(w, negcount)
                    start = i + 1
                elif word in exclamations:
                    # Emphasise the sentiment word closest to the sentence end.
                    for w2 in reversed(seg_sent):
                        if w2 in posdict:
                            poscount += 2
                            break
                        elif w2 in negdict:
                            negcount += 2
                            break
            single_review_count.append(
                transform_to_positive_num(poscount, negcount))
        all_review_count.append(single_review_count)

    return all_review_count
def word_by_word_review(filepath, sheetnum, colnum,
                        stopword_path='D:/ReviewHelpfulnessPrediction\\PreprocessingModule/stopword.txt'):
    """Return all non-stopword tokens of an Excel review column as one list.

    Args:
        filepath: Path of the Excel file, forwarded to ``tp.get_excel_data``.
        sheetnum: Sheet selector, forwarded to ``tp.get_excel_data``.
        colnum: Column selector, forwarded to ``tp.get_excel_data``.
        stopword_path: Path handed to ``tp.get_txt_data(..., 'lines')`` to
            load the stopwords. Defaults to the location that used to be
            hard-coded here; the backslash is now escaped explicitly, which
            yields the same string without relying on the invalid ``\\P``
            escape sequence.

    Returns:
        A one-dimensional list of the segmented words of every review, in
        order, excluding stopwords and single-space tokens.
    """
    # Read the review cells and keep the first ``rownum`` of them.
    cells = tp.get_excel_data(filepath, sheetnum, colnum, 'data')
    rownum = tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')
    segmented = [tp.segmentation(cell, 'list') for cell in cells[0:rownum]]

    # NOTE(review): assumes the 'lines' mode returns an iterable of stopword
    # strings -- confirm. A set makes each membership test O(1).
    stopwords = set(tp.get_txt_data(stopword_path, 'lines'))

    # Filter and flatten in a single pass.
    return [
        word
        for review in segmented
        for word in review
        if word not in stopwords and word != ' '
    ]
def single_review_sentiment_score(review):
    """Compute the overall sentiment score of a single review.

    The review is cut into sentences with ``tp.cut_sentence_2`` and each
    sentence is segmented with ``tp.segmentation(..., 'list')``. For every
    sentence a positive and a negative counter are accumulated:

    * a word in ``posdict`` / ``negdict`` adds 1 to the matching counter, and
      the words between the previous sentiment word and this one are passed
      through ``match(word, count)`` to adjust that counter;
    * an exclamation mark adds 2 to the counter of the last sentiment word
      found when scanning the sentence from its end.

    Each pair goes through ``transform_to_positive_num`` and the resulting
    per-sentence scores are combined by ``sumup_sentence_sentiment_score``.

    Args:
        review: The review text to score.

    Returns:
        Whatever ``sumup_sentence_sentiment_score`` returns for the list of
        per-sentence scores.
    """
    # NOTE(review): the original compared the word against the ASCII "!"
    # twice; the second operand is presumed to have been the full-width
    # exclamation mark (U+FF01) used in Chinese text -- confirm.
    # Unicode literals replace the Python-2-only "!".decode('utf8').
    exclamations = (u'!', u'\uff01')

    single_review_senti_score = []
    for sent in tp.cut_sentence_2(review):  # one score per sentence
        seg_sent = tp.segmentation(sent, 'list')
        start = 0  # index just after the previous sentiment word
        poscount = 0  # positive counter for this sentence
        negcount = 0  # negative counter for this sentence

        for i, word in enumerate(seg_sent):
            if word in posdict:
                poscount += 1
                # Words since the previous sentiment word adjust this one.
                for w in seg_sent[start:i]:
                    poscount = match(w, poscount)
                start = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[start:i]:
                    negcount = match(w, negcount)
                start = i + 1
            elif word in exclamations:
                # Every exclamation mark adds a weight of 2 to the sentiment
                # word closest to the end of the sentence.
                for w2 in reversed(seg_sent):
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break

        single_review_senti_score.append(
            transform_to_positive_num(poscount, negcount))

    return sumup_sentence_sentiment_score(single_review_senti_score)