def cut_sentences_words(self, review):
    """Split a review into sentences and tokenize every sentence.

    Args:
        review: Review text; handed unchanged to ``tp.cut_sentence_2``.

    Returns:
        A list with one entry per sentence, in sentence order. Each entry
        is the result of ``tp.segmentation(sentence, 'list')``.
    """
    # Stop-word filtering (self.stopWordFilter) is not applied to the
    # segmented sentences; the call was disabled in the original code.
    return [
        tp.segmentation(sentence, 'list')
        for sentence in tp.cut_sentence_2(review)
    ]
def word_sent_count(dataset):
    """Compute per-review word count, sentence count and their ratio.

    Args:
        dataset: Iterable of review texts.

    Returns:
        A list with one ``[word_num, sent_num, words_per_sentence]`` entry
        per review, in input order. ``words_per_sentence`` is a float; it
        is ``0.0`` when the review yields no sentences, so an empty review
        no longer raises ``ZeroDivisionError``.
    """
    # A distinct local name avoids shadowing the function itself.
    counts = []
    for review in dataset:
        # Split the review into sentences, then tokenize the whole review.
        sent_num = len(tp.cut_sentence_2(review))
        word_num = len(tp.segmentation(review, 'list'))
        # Review "length" = number of words / number of sentences.
        if sent_num:
            words_per_sentence = float(word_num) / float(sent_num)
        else:
            words_per_sentence = 0.0
        counts.append([word_num, sent_num, words_per_sentence])
    return counts
def word_sent_count(dataset):
    """Return word/sentence statistics for every review in ``dataset``.

    Args:
        dataset: Iterable of review texts.

    Returns:
        A list of ``[word_num, sent_num, sent_word]`` triples, one per
        review, where ``sent_word`` is the float ratio of words to
        sentences. When a review produces zero sentences the ratio is
        reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    stats = []  # not named like the function, to avoid shadowing it
    for review in dataset:
        sents = tp.cut_sentence_2(review)
        words = tp.segmentation(review, 'list')
        word_num, sent_num = len(words), len(sents)
        # review length = word number / sentence number
        sent_word = float(word_num) / sent_num if sent_num else 0.0
        stats.append([word_num, sent_num, sent_word])
    return stats
def word_sent_count(dataset):
    """Measure each review: words, sentences, and words per sentence.

    Args:
        dataset: Iterable of review texts.

    Returns:
        A list (same order as ``dataset``) of
        ``[word_num, sent_num, words_per_sentence]`` lists. The ratio is a
        float and falls back to ``0.0`` for a review with no sentences,
        which previously raised ``ZeroDivisionError``.
    """
    result = []  # renamed so the local no longer shadows the function
    for review in dataset:
        sentences = tp.cut_sentence_2(review)
        tokens = tp.segmentation(review, "list")
        sent_num = len(sentences)
        word_num = len(tokens)
        if sent_num == 0:
            # No sentences: the ratio is undefined, report it as zero.
            ratio = 0.0
        else:
            # review length = word number / sentence number
            ratio = float(word_num) / float(sent_num)
        result.append([word_num, sent_num, ratio])
    return result
def sentence_sentiment_score(dataset):
    """Score every sentence of every review and log non-sentiment words.

    Each review is split with ``tp.cut_sentence_2`` and each sentence is
    tokenized with ``tp.segmentation``. For every sentence a positive and
    a negative counter are accumulated and then passed through
    ``transform_to_positive_num``.

    Side effect: every word that is neither in ``posdict`` / ``negdict``
    nor an exclamation mark is tagged with ``tp.postagger``; for each tag
    entry whose second element is ``"a"``, ``"d"``, ``"v"`` or ``"n"`` the
    UTF-8 encoded word plus a newline is written to the module-level
    ``fo`` object.

    Args:
        dataset: Iterable of review texts.

    Returns:
        A list with one entry per review; each entry is the list of
        per-sentence results of ``transform_to_positive_num``.
    """
    logged_tags = ("a", "d", "v", "n")
    split_reviews = [tp.cut_sentence_2(cell) for cell in dataset]

    all_review_count = []
    for review in split_reviews:
        sentence_scores = []
        for sent in review:
            seg_sent = tp.segmentation(sent, "list")  # tokenize
            window_start = 0  # index just past the last sentiment word
            poscount = 0
            negcount = 0
            for position, word in enumerate(seg_sent):
                if word in posdict:
                    poscount += 1
                    # Feed the words since the previous sentiment word
                    # through match() to adjust the counter.
                    for modifier in seg_sent[window_start:position]:
                        poscount = match(modifier, poscount)
                    window_start = position + 1
                elif word in negdict:
                    negcount += 1
                    for modifier in seg_sent[window_start:position]:
                        negcount = match(modifier, negcount)
                    window_start = position + 1
                # NOTE(review): both operands below are the same literal
                # as seen here; one was presumably a full-width
                # exclamation mark -- confirm against the original file.
                elif word == "!".decode("utf8") or word == "!".decode("utf8"):
                    # Walk the sentence backwards; the first sentiment
                    # word found gets an extra weight of 2.
                    for candidate in seg_sent[::-1]:
                        if candidate in posdict:
                            poscount += 2
                            break
                        if candidate in negdict:
                            negcount += 2
                            break
                else:
                    for tag in tp.postagger(word, "list"):
                        if tag[1] in logged_tags:
                            fo.write(word.encode("utf8") + "\n")
            # Per-sentence positive/negative weights.
            sentence_scores.append(
                transform_to_positive_num(poscount, negcount))
        # [[[s11_score], [s12_score], ...], [[s21_score], ...], ...]
        all_review_count.append(sentence_scores)
    return all_review_count
def sentence_sentiment_score(dataset):
    """Score the sentences of a fixed slice of ``dataset``.

    Only ``dataset[1:10]`` is processed. Each selected review is split
    with ``tp.cut_sentence_2`` and the resulting sentence list is scored
    by ``get_single_sent_count``.

    Args:
        dataset: Sliceable sequence of review texts.

    Returns:
        A list with one ``get_single_sent_count`` result per selected
        review, in order.
    """
    # NOTE(review): the hard-coded [1:10] slice skips the first item and
    # everything from index 10 on; it looks like a debugging leftover --
    # confirm before relying on full-dataset results.
    selected = dataset[1:10]

    # All reviews are split first, then scored, matching the original
    # call order.
    split_reviews = [tp.cut_sentence_2(cell) for cell in selected]
    return [get_single_sent_count(review) for review in split_reviews]
def sentence_sentiment_score(dataset):
    """Compute a sentiment score for each sentence of each review.

    Reviews are split into sentences with ``tp.cut_sentence_2`` and each
    sentence is tokenized with ``tp.segmentation``. Words found in
    ``posdict`` / ``negdict`` increment the matching counter, which is
    then adjusted by ``match`` for every word since the previous
    sentiment word. An exclamation mark adds 2 to the counter of the
    first sentiment word found when scanning the sentence backwards.

    Args:
        dataset: Iterable of review texts.

    Returns:
        A list with one entry per review; each entry lists the
        ``transform_to_positive_num(poscount, negcount)`` result of every
        sentence of that review.
    """
    split_reviews = [tp.cut_sentence_2(cell) for cell in dataset]

    all_review_count = []
    for review in split_reviews:
        per_sentence = []
        for sent in review:
            tokens = tp.segmentation(sent, 'list')
            last_hit = 0  # index just past the last sentiment word
            poscount = 0
            negcount = 0
            for idx, word in enumerate(tokens):
                if word in posdict:
                    poscount += 1
                    for prev in tokens[last_hit:idx]:
                        poscount = match(prev, poscount)
                    last_hit = idx + 1
                elif word in negdict:
                    negcount += 1
                    for prev in tokens[last_hit:idx]:
                        negcount = match(prev, negcount)
                    last_hit = idx + 1
                # NOTE(review): the two literals compared here are
                # identical as seen; one was presumably a full-width
                # exclamation mark -- confirm.
                elif word == '!'.decode('utf8') or word == '!'.decode('utf8'):
                    for other in tokens[::-1]:
                        if other in posdict:
                            poscount += 2
                            break
                        if other in negdict:
                            negcount += 2
                            break
            # [[s1_score], [s2_score], ...]
            per_sentence.append(
                transform_to_positive_num(poscount, negcount))
        # [[[s11_score], [s12_score], ...], [[s21_score], ...], ...]
        all_review_count.append(per_sentence)
    return all_review_count
def _score_segmented_sentence(seg_sent):
    """Return the ``transform_to_positive_num`` result for one token list."""
    anchor = 0  # index just past the most recent sentiment word
    poscount = 0
    negcount = 0
    for index, word in enumerate(seg_sent):
        if word in posdict:
            poscount += 1
            for preceding in seg_sent[anchor:index]:
                poscount = match(preceding, poscount)
            anchor = index + 1
        elif word in negdict:
            negcount += 1
            for preceding in seg_sent[anchor:index]:
                negcount = match(preceding, negcount)
            anchor = index + 1
        # NOTE(review): both sides of this ``or`` use the same literal as
        # seen here; one was presumably a full-width exclamation mark --
        # confirm against the original file.
        elif word == "!".decode("utf8") or word == "!".decode("utf8"):
            # Scan backwards; the first sentiment word found earns +2.
            for seen in seg_sent[::-1]:
                if seen in posdict:
                    poscount += 2
                    break
                if seen in negdict:
                    negcount += 2
                    break
    return transform_to_positive_num(poscount, negcount)


def sentence_sentiment_score(dataset):
    """Score every sentence of every review in ``dataset``.

    Args:
        dataset: Iterable of review texts. Each one is split into
            sentences with ``tp.cut_sentence_2``; each sentence is
            tokenized with ``tp.segmentation(sent, "list")``.

    Returns:
        A nested list shaped like
        ``[[s11_score, s12_score, ...], [s21_score, ...], ...]`` where
        each score is a ``transform_to_positive_num`` result.
    """
    cuted_review = [tp.cut_sentence_2(cell) for cell in dataset]

    all_review_count = []
    for review in cuted_review:
        review_scores = []
        for sent in review:
            review_scores.append(
                _score_segmented_sentence(tp.segmentation(sent, "list")))
        all_review_count.append(review_scores)
    return all_review_count
def single_review_sentiment_score(review):
    """Compute the aggregated sentiment score of a single review.

    The review is split into sentences (``tp.cut_sentence_2``) and each
    sentence into words (``tp.segmentation``). Per sentence, a positive
    and a negative counter are accumulated, normalized with
    ``transform_to_positive_num`` and finally combined across sentences
    by ``sumup_sentence_sentiment_score``.

    Side effect: every word that is neither a sentiment word nor an
    exclamation mark is written to the module-level ``fo`` object.

    Args:
        review: Review text.

    Returns:
        Whatever ``sumup_sentence_sentiment_score`` returns for the list
        of per-sentence scores.
    """
    single_review_senti_score = []
    for sent in tp.cut_sentence_2(review):  # cut sentence
        seg_sent = tp.segmentation(sent, "list")  # cut word
        s = 0  # index just past the previous sentiment word
        poscount = 0  # positive weight of this sentence
        negcount = 0  # negative weight of this sentence
        for i, word in enumerate(seg_sent):
            if word in posdict:
                poscount += 1
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)
                # Advance the window start. The original assigned to an
                # unused name ``a`` here, so ``s`` stayed 0 and earlier
                # words were re-applied to later sentiment words.
                s = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)
                s = i + 1
            # Match "!" in the review; every "!" has a weight of +2.
            # NOTE(review): the original comment says this covers both
            # the Chinese and the English exclamation mark, but the two
            # literals are identical as seen here -- confirm.
            elif word == "!".decode("utf8") or word == "!".decode("utf8"):
                # Scan backwards for the nearest sentiment word.
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            else:
                fo.write(word)
        # poscount / negcount: positive and negative weight of the sentence
        single_review_senti_score.append(
            transform_to_positive_num(poscount, negcount))
    return sumup_sentence_sentiment_score(single_review_senti_score)
def single_review_sentiment_score(review):
    """Return the overall sentiment score of one review.

    Each sentence of the review gets a positive and a negative counter:
    a word in ``posdict`` / ``negdict`` adds 1 to the matching counter,
    which is then adjusted by ``match`` for every word located between
    the previous sentiment word and this one. An exclamation mark adds 2
    to the counter of the first sentiment word found scanning backwards.

    Args:
        review: Review text.

    Returns:
        The result of ``sumup_sentence_sentiment_score`` applied to the
        per-sentence ``transform_to_positive_num`` results.
    """
    single_review_senti_score = []
    cuted_review = tp.cut_sentence_2(review)
    for sent in cuted_review:
        seg_sent = tp.segmentation(sent, 'list')
        s = 0  # sentiment word position (start of the modifier window)
        poscount = 0  # count a positive word
        negcount = 0  # count a negative word
        for i, word in enumerate(seg_sent):
            if word in posdict:
                poscount += 1
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)
                # Fixed: the original wrote ``a = i + 1`` to a name that
                # is never read, so the window always started at index 0.
                s = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)
                s = i + 1
            # Match "!" in the review, every "!" has a weight of +2
            elif word == "!".decode('utf8') or word == "!".decode('utf8'):
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
        single_review_senti_score.append(
            transform_to_positive_num(poscount, negcount))
    review_sentiment_score = sumup_sentence_sentiment_score(
        single_review_senti_score)
    return review_sentiment_score
def single_review_sentiment_score(review):
    """Score a single review by summing up its sentence-level scores.

    Args:
        review: Review text; split with ``tp.cut_sentence_2`` and
            tokenized sentence by sentence with ``tp.segmentation``.

    Returns:
        ``sumup_sentence_sentiment_score`` applied to the list of
        ``transform_to_positive_num(poscount, negcount)`` values, one
        per sentence.
    """
    sentence_scores = []
    for sent in tp.cut_sentence_2(review):
        seg_sent = tp.segmentation(sent, 'list')
        window_start = 0  # index just past the last sentiment word seen
        poscount = 0
        negcount = 0
        for i, word in enumerate(seg_sent):
            if word in posdict:
                poscount += 1
                for w in seg_sent[window_start:i]:
                    poscount = match(w, poscount)
                # The original kept the slice start in ``s`` but updated
                # an unrelated name ``a``; the window therefore never
                # moved. A single variable fixes that.
                window_start = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[window_start:i]:
                    negcount = match(w, negcount)
                window_start = i + 1
            # Match "!" in the review, every "!" has a weight of +2
            elif word == "!".decode('utf8') or word == "!".decode('utf8'):
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
        sentence_scores.append(transform_to_positive_num(poscount, negcount))
    return sumup_sentence_sentiment_score(sentence_scores)
def sentiment_score_list(dataset):
    """Compute ``[positive, negative]`` scores per sentence per review.

    Each review is split with ``tp.cut_sentence_2`` and each sentence is
    tokenized with ``tp.segmentation``. A word in ``posdict`` or
    ``negdict`` starts with weight 1, which is scaled by the degree words
    found between the previous sentiment word and this one:

    * ``mostdict``: x4, ``verydict``: x3, ``moredict``: x2
    * ``ishdict``: /2, ``insufficientdict``: /4
    * ``inversedict``: counted; an odd count flips the sign

    An exclamation token adds 2 to both totals when the sentence contains
    at least one sentiment word. Finally, negative totals are moved to
    the opposite polarity so that both reported values are non-negative.

    Args:
        dataset: Iterable of review texts.

    Returns:
        A list with one entry per review; each entry is a list of
        ``[pos_count, neg_count]`` pairs, one per sentence.
    """
    cuted_data = [tp.cut_sentence_2(cell) for cell in dataset]

    all_scores = []
    for sents in cuted_data:  # every review
        review_scores = []
        for sent in sents:  # every sentence of the review
            segtmp = tp.segmentation(sent, 'list')  # tokenized sentence
            a = 0  # index just past the previous sentiment word
            pos_total = 0
            neg_total = 0
            for i, word in enumerate(segtmp):
                if word in posdict:
                    poscount = 1
                    c = 0  # number of negation words in the window
                    for w in segtmp[a:i]:
                        if w in mostdict:
                            poscount *= 4.0
                        elif w in verydict:
                            poscount *= 3.0
                        elif w in moredict:
                            poscount *= 2.0
                        elif w in ishdict:
                            poscount /= 2.0
                        elif w in insufficientdict:
                            poscount /= 4.0
                        elif w in inversedict:
                            c += 1
                    if judgeodd(c) == 'odd':
                        poscount *= -1
                    # The original routed this through an intermediate
                    # accumulator that was always reset to 0; adding
                    # directly is equivalent.
                    pos_total += poscount
                    a = i + 1
                elif word in negdict:
                    negcount = 1
                    d = 0  # number of negation words in the window
                    for w in segtmp[a:i]:
                        if w in mostdict:
                            negcount *= 4.0
                        elif w in verydict:
                            negcount *= 3.0
                        elif w in moredict:
                            negcount *= 2.0
                        elif w in ishdict:
                            negcount /= 2.0
                        elif w in insufficientdict:
                            negcount /= 4.0
                        elif w in inversedict:
                            d += 1
                    if judgeodd(d) == 'odd':
                        negcount *= -1.0
                    neg_total += negcount
                    a = i + 1
                # NOTE(review): the first literal is '|' as seen here; it
                # was presumably a full-width exclamation mark -- confirm.
                elif word == '|'.decode('utf-8') or word == '!'.decode('utf8'):
                    for w2 in segtmp[::-1]:
                        # Fixed: ``w2 in posdict or negdict`` parsed as
                        # ``(w2 in posdict) or negdict`` and was true for
                        # any word whenever negdict was non-empty.
                        if w2 in posdict or w2 in negdict:
                            pos_total += 2
                            neg_total += 2
                            break

            # Move negative totals to the opposite polarity. ``>=`` (the
            # original used ``>``) also covers the case where the other
            # side is exactly 0, which used to leak a negative value.
            if pos_total < 0 and neg_total >= 0:
                pos_count = 0
                neg_count = neg_total - pos_total
            elif neg_total < 0 and pos_total >= 0:
                pos_count = pos_total - neg_total
                neg_count = 0
            elif neg_total < 0 and pos_total < 0:
                neg_count = -pos_total
                pos_count = -neg_total
            else:
                pos_count = pos_total
                neg_count = neg_total
            review_scores.append([pos_count, neg_count])
        all_scores.append(review_scores)
    return all_scores
def single_review_sentiment_score(review):
    """Return the aggregated sentiment score of one review.

    Args:
        review: Review text; split into sentences with
            ``tp.cut_sentence_2``.

    Returns:
        The result of ``sumup_sentence_sentiment_score`` applied to the
        per-sentence scores produced by ``get_single_sent_count``.
    """
    sentences = tp.cut_sentence_2(review)
    return sumup_sentence_sentiment_score(get_single_sent_count(sentences))