def seg_filter_txt(filepath, storepath):
    # Read the raw review text and segment its first line into a word list
    txtfile = open(filepath, 'r')
    txtdata = txtfile.readlines()
    txtfile.close()
    review_data = tp.segmentation(txtdata[0], 'list')

    # Load the stopword list
    stopfile = open('D:/code/seg_fil_test/stopword.txt', 'r')
    stopdata1 = stopfile.readlines()
    stopdata2 = ''.join(stopdata1)
    stopwords = stopdata2.decode('utf8').split('\n')
    stopfile.close()

    # Filter stopwords and blanks out of the segmented words
    seg_fil_result = [word for word in review_data
                      if word not in stopwords and word != ' ']

    # Write the filtered words to the store file, separated by spaces
    fil_file = open(storepath, 'w')
    for word in seg_fil_result:
        fil_file.write(word.encode('utf8') + ' ')
    fil_file.close()

def is_single_review_sentiment(review):
    # Split the review into sentences
    cuted_review = tp.cut_sentence_2(review)
    for sent in cuted_review:
        # Segment each sentence into words
        seg_sent = tp.segmentation(sent, 'list')
        for word in seg_sent:
            # The review is sentiment-bearing if any word is in the sentiment dictionary
            if word in sentiment_dict:
                return True
    return False

def word_sent_count(dataset):
    word_sent_count = []
    for review in dataset:
        sents = tp.cut_sentence_2(review)        # split into sentences
        words = tp.segmentation(review, 'list')  # split into words
        sent_num = len(sents)
        word_num = len(words)
        # Review length feature: word number / sentence number,
        # i.e. the average number of words per sentence
        sent_word = float(word_num) / float(sent_num)
        word_sent_count.append([word_num, sent_num, sent_word])
    return word_sent_count

def word_sent_count(rawData):
    begin = time.clock()
    word_sent_count = []
    for review in rawData:
        sents = tp.cut_sentence_2(review)        # split into sentences
        words = tp.segmentation(review, 'list')  # split into words
        sent_num = len(sents)
        word_num = len(words)
        # Review length feature: word number / sentence number,
        # i.e. the average number of words per sentence
        sent_word = float(word_num) / float(sent_num)
        word_sent_count.append([word_num, sent_num, sent_word])
    end = time.clock()
    print 'extract word_sent_count feature time is:', end - begin, 'handle data item num is:', len(rawData)
    return word_sent_count

def sentence_sentiment_score(dataset):
    # Split every review into sentences
    cuted_review = []
    for cell in dataset:
        cuted_review.append(tp.cut_sentence_2(cell))

    all_review_count = []
    for review in cuted_review:
        single_review_count = []
        if len(review) == 0:
            # Empty line: record a zero score
            single_review_count.append(transform_to_positive_num(0, 0))
        for sent in review:
            seg_sent = tp.segmentation(sent, 'list')
            i = 0         # word position counter
            a = 0         # position just after the last matched sentiment word
            poscount = 0  # count of positive words
            negcount = 0  # count of negative words
            for word in seg_sent:
                if word in posdict:
                    poscount += 1
                    # Apply the modifiers between the previous sentiment word and this one
                    for w in seg_sent[a:i]:
                        poscount = match(w, poscount)
                    a = i + 1
                elif word in negdict:
                    negcount += 1
                    for w in seg_sent[a:i]:
                        negcount = match(w, negcount)
                    a = i + 1
                # An exclamation mark (half-width or full-width) emphasises the sentence
                # sentiment: add 2 to the polarity of the last sentiment word in the sentence
                elif word == '!'.decode('utf8') or word == '！'.decode('utf8'):
                    for w2 in seg_sent[::-1]:
                        if w2 in posdict:
                            poscount += 2
                            break
                        elif w2 in negdict:
                            negcount += 2
                            break
                i += 1
            # [[s1_score], [s2_score], ...]
            single_review_count.append(transform_to_positive_num(poscount, negcount))
        # [[[s11_score], [s12_score], ...], [[s21_score], [s22_score], ...], ...]
        all_review_count.append(single_review_count)
    return all_review_count

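# A minimal usage sketch for sentence_sentiment_score (assumptions: the project's
# text-processing helpers are imported as `tp`, and posdict/negdict together with
# match/transform_to_positive_num are already loaded; the file path below is a
# hypothetical placeholder, not a file shipped with this code):
# reviews = tp.get_txt_data('D:/code/seg_fil_test/review_set.txt', 'lines')
# sent_scores = sentence_sentiment_score(reviews)
# # sent_scores[i][j] holds the transformed score pair of sentence j in review i
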
def word_by_word_review(filepath, sheetnum, colnum):
    # Read product review data from an Excel file and segment every review
    review_data = []
    all_data = tp.get_excel_data(filepath, sheetnum, colnum, 'data')
    row_num = tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')
    for cell in all_data[0:row_num]:
        review_data.append(tp.segmentation(cell, 'list'))  # segment every review

    # Read the txt file containing the stopwords
    stopwords = tp.get_txt_data('D:/ReviewHelpfulnessPrediction\PreprocessingModule/stopword.txt', 'lines')

    # Filter stopwords from the reviews
    seg_fil_result = []
    for review in review_data:
        fil = [word for word in review if word not in stopwords and word != ' ']
        seg_fil_result.append(fil)

    # Return the review set as a one-dimensional list of words
    review = list(itertools.chain(*seg_fil_result))
    return review

def single_review_sentiment_score(review):
    single_review_senti_score = []
    # Split the review into sentences
    cuted_review = tp.cut_sentence_2(review)
    for sent in cuted_review:
        # Segment the sentence into words
        seg_sent = tp.segmentation(sent, 'list')
        i = 0         # word position counter
        s = 0         # position just after the last matched sentiment word
        poscount = 0  # count of positive words
        negcount = 0  # count of negative words
        for word in seg_sent:
            if word in posdict:
                poscount += 1
                # Apply the modifiers between the previous sentiment word and this one
                for w in seg_sent[s:i]:
                    poscount = match(w, poscount)
                s = i + 1
            elif word in negdict:
                negcount += 1
                for w in seg_sent[s:i]:
                    negcount = match(w, negcount)
                s = i + 1
            # Match "!" (half-width or full-width) in the review: each one adds a weight
            # of +2 to the last sentiment word in the sentence, since an exclamation mark
            # emphasises the sentence sentiment
            elif word == '!'.decode('utf8') or word == '！'.decode('utf8'):
                for w2 in seg_sent[::-1]:
                    if w2 in posdict:
                        poscount += 2
                        break
                    elif w2 in negdict:
                        negcount += 2
                        break
            i += 1
        single_review_senti_score.append(transform_to_positive_num(poscount, negcount))
    # Aggregate the per-sentence scores into one score for the whole review
    review_sentiment_score = sumup_sentence_sentiment_score(single_review_senti_score)
    return review_sentiment_score

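# A minimal usage sketch for single_review_sentiment_score (same assumptions as
# above; the review string is only an illustrative placeholder):
# review = u'...'  # one raw product review, e.g. read via tp.get_txt_data(...)
# score = single_review_sentiment_score(review)  # aggregated score for the whole review
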