def is_single_review_sentiment(review):
    """Tell whether ``review`` contains at least one sentiment word.

    The review is split into sentences with ``tp.cut_sentence_2`` and each
    sentence is tokenised with ``tp.segmentation(sentence, 'list')``. The
    scan stops at the first token found in ``sentiment_dict``.

    Returns:
        True if any token is in ``sentiment_dict``, otherwise False.
    """
    # The generator is lazy, so later sentences are only segmented when
    # no earlier token has matched.
    return any(
        token in sentiment_dict
        for sentence in tp.cut_sentence_2(review)
        for token in tp.segmentation(sentence, 'list')
    )
def word_sent_count(dataset):
    """Compute word count, sentence count and words-per-sentence per review.

    NOTE(review): another function named ``word_sent_count`` is visible
    further down in this source; if both live in the same module the later
    definition replaces this one at import time.

    Args:
        dataset: iterable of review texts.

    Returns:
        A list with one ``[word_num, sent_num, sent_word]`` entry per review,
        where ``sent_word`` is ``word_num / sent_num`` as a float. A review
        that yields no sentences gets ``sent_word == 0.0``.
    """
    counts = []
    for review in dataset:
        sent_num = len(tp.cut_sentence_2(review))  # split into sentences
        word_num = len(tp.segmentation(review, 'list'))  # split into words
        # Review length = word number / sentence number, i.e. the average
        # number of words per sentence. Guard the division: a review with
        # no sentences would otherwise raise ZeroDivisionError.
        sent_word = float(word_num) / float(sent_num) if sent_num else 0.0
        counts.append([word_num, sent_num, sent_word])
    return counts
def word_sent_count(rawData):
    """Compute per-review length features and report the elapsed time.

    For every review the text is split into sentences with
    ``tp.cut_sentence_2`` and into words with
    ``tp.segmentation(review, 'list')``. A one-line timing summary is
    printed to stdout once all reviews have been processed.

    Args:
        rawData: sized iterable of review texts (``len(rawData)`` is
            reported in the summary line).

    Returns:
        A list with one ``[word_num, sent_num, sent_word]`` entry per review,
        where ``sent_word`` is ``word_num / sent_num`` as a float. A review
        that yields no sentences gets ``sent_word == 0.0``.
    """
    # time.clock was removed in Python 3.8; use perf_counter where it exists
    # and fall back to clock on interpreters that lack it (Python 2).
    timer = getattr(time, 'perf_counter', None) or time.clock
    begin = timer()
    counts = []
    for review in rawData:
        sent_num = len(tp.cut_sentence_2(review))  # split into sentences
        word_num = len(tp.segmentation(review, 'list'))  # split into words
        # Review length = word number / sentence number, i.e. the average
        # number of words per sentence. Guard the division: a review with
        # no sentences would otherwise raise ZeroDivisionError.
        sent_word = float(word_num) / float(sent_num) if sent_num else 0.0
        counts.append([word_num, sent_num, sent_word])
    end = timer()
    # Single pre-formatted string so the output is the same text whether
    # print is a statement (Python 2) or a function (Python 3).
    print('extract word_sent_count feature time is: %s handle data item num is: %s'
          % (end - begin, len(rawData)))
    return counts
def sentence_sentiment_score(dataset):
    """Score every sentence of every review in ``dataset``.

    Each review is split with ``tp.cut_sentence_2`` and each sentence is
    tokenised with ``tp.segmentation(sentence, 'list')``. Within a sentence:

    * a token in ``posdict`` adds 1 to the positive count, after which the
      count is passed through ``match`` once for every token lying between
      the previous sentiment word and this one;
    * a token in ``negdict`` does the same for the negative count;
    * an exclamation mark adds 2 to the positive or negative count,
      according to the sentiment word closest to the end of the sentence
      (nothing is added if the sentence has no sentiment word).

    Args:
        dataset: iterable of review texts.

    Returns:
        A list with one entry per review. Each entry is a list holding
        ``transform_to_positive_num(poscount, negcount)`` for each sentence:
        ``[[s11, s12, ...], [s21, s22, ...], ...]``. A review that yields no
        sentences contributes a single ``transform_to_positive_num(0, 0)``.
    """
    # The original compared the token with the same ASCII '!' twice, which
    # made the second test dead code. Presumably one operand was the
    # full-width mark (U+FF01) used in Chinese text, so both are accepted.
    exclamation_marks = (u'!', u'\uff01')

    cut_reviews = [tp.cut_sentence_2(cell) for cell in dataset]

    all_review_count = []
    for review in cut_reviews:
        single_review_count = []
        if len(review) == 0:
            # Empty line in the input: still emit one neutral score.
            single_review_count.append(transform_to_positive_num(0, 0))
        for sent in review:
            seg_sent = tp.segmentation(sent, 'list')
            window_start = 0  # index just past the previous sentiment word
            poscount = 0
            negcount = 0
            for i, word in enumerate(seg_sent):
                if word in posdict:
                    poscount += 1
                    # Let the tokens preceding this sentiment word adjust
                    # the running count (semantics are defined by match).
                    for w in seg_sent[window_start:i]:
                        poscount = match(w, poscount)
                    window_start = i + 1
                elif word in negdict:
                    negcount += 1
                    for w in seg_sent[window_start:i]:
                        negcount = match(w, negcount)
                    window_start = i + 1
                elif word in exclamation_marks:
                    # Scan from the end of the sentence; the first
                    # sentiment word found decides which count is boosted.
                    for w2 in reversed(seg_sent):
                        if w2 in posdict:
                            poscount += 2
                            break
                        elif w2 in negdict:
                            negcount += 2
                            break
            single_review_count.append(
                transform_to_positive_num(poscount, negcount))
        all_review_count.append(single_review_count)
    return all_review_count
def single_review_sentiment_score(review):
    """Compute the aggregated sentiment score of a single review.

    The review is split into sentences with ``tp.cut_sentence_2`` and each
    sentence is tokenised with ``tp.segmentation(sentence, 'list')``. Within
    a sentence:

    * a token in ``posdict`` adds 1 to the positive count, after which the
      count is passed through ``match`` once for every token lying between
      the previous sentiment word and this one;
    * a token in ``negdict`` does the same for the negative count;
    * an exclamation mark emphasises the sentence: it adds 2 to the positive
      or negative count, according to the sentiment word closest to the end
      of the sentence (nothing is added if there is no sentiment word).

    Each sentence's counts go through ``transform_to_positive_num`` and the
    per-sentence results are combined by ``sumup_sentence_sentiment_score``.

    Args:
        review: the review text.

    Returns:
        Whatever ``sumup_sentence_sentiment_score`` returns for the list of
        per-sentence scores.
    """
    # The original compared the token with the same ASCII '!' twice, which
    # made the second test dead code. Presumably one operand was the
    # full-width mark (U+FF01) used in Chinese text, so both are accepted.
    exclamation_marks = (u'!', u'\uff01')

    single_review_senti_score = []
    for sent in tp.cut_sentence_2(review):  # one pass per sentence
        seg_sent = tp.segmentation(sent, 'list')  # tokenise the sentence
        # Index just past the previous sentiment word: the tokens in
        # seg_sent[window_start:i] are the ones handed to match for the
        # sentiment word found at position i.
        window_start = 0
        poscount = 0  # running positive score of this sentence
        negcount = 0  # running negative score of this sentence
        for i, word in enumerate(seg_sent):
            if word in posdict:
                poscount += 1
                for w in seg_sent[window_start:i]:
                    poscount = match(w, poscount)
                window_start = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[window_start:i]:
                    negcount = match(w, negcount)
                window_start = i + 1
            elif word in exclamation_marks:
                # Scan from the end of the sentence; the first sentiment
                # word found decides which count receives the +2 weight.
                for w2 in reversed(seg_sent):
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
        single_review_senti_score.append(
            transform_to_positive_num(poscount, negcount))
    return sumup_sentence_sentiment_score(single_review_senti_score)