def word_by_word_review(filepath, sheetnum, colnum):
    """Read product reviews from an excel sheet, segment each review,
    strip stopwords, and return every remaining word as one flat list.

    filepath -- path of the excel workbook
    sheetnum -- sheet index forwarded to tp.get_excel_data
    colnum   -- column index holding the review text

    Returns a one-dimensional list of words from all reviews.
    """
    # Read product review data from excel file and segment every review.
    # NOTE(review): tp.get_excel_data is a project helper; assumed that
    # 'rownum' returns an int row count -- confirm in textprocessing.
    review_data = []
    for cell in tp.get_excel_data(filepath, sheetnum, colnum, 'data')[
            0:tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')]:
        review_data.append(tp.segmentation(cell, 'list'))  # segment every review

    # Read the txt file containing stopwords (one per line).
    stopwords = tp.get_txt_data(
        'E:/GraduationProject/pythoncode/project/Prediction/main/PreprocessingModule/stopword.txt',
        'lines')

    # Filter stopwords (and bare spaces left by the segmenter) from every
    # review.  The original's dead `fil = []` reset after append and the
    # dead commented-out stopword path were removed.
    seg_fil_result = [
        [word for word in review if word not in stopwords and word != ' ']
        for review in review_data
    ]

    # Return the review set as a one-dimensional list.
    return list(itertools.chain(*seg_fil_result))
def word_by_word_review(filepath, sheetnum, colnum):
    """Read product reviews from an excel sheet, segment each review,
    strip stopwords, and return every remaining word as one flat list.

    filepath -- path of the excel workbook
    sheetnum -- sheet index forwarded to tp.get_excel_data
    colnum   -- column index holding the review text

    Returns a one-dimensional list of words from all reviews.
    """
    # Read product review data from excel file and segment every review.
    # BUG FIX: the row count was fetched via a bare `get_excel_data(...)`,
    # which raises NameError -- only the module alias `tp` is in scope.
    review_data = []
    for cell in tp.get_excel_data(filepath, sheetnum, colnum, 'data')[
            0:tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')]:
        review_data.append(tp.segmentation(cell, 'list'))  # segment every review

    # Read the txt file containing stopwords (one per line).
    stopwords = tp.get_txt_data(
        '/home/sooda/nlp/Review-Helpfulness-Prediction/data/stopword.txt',
        'lines')

    # Filter stopwords (and bare spaces left by the segmenter) from every
    # review; the dead `fil = []` reset of the original was dropped.
    seg_fil_result = [
        [word for word in review if word not in stopwords and word != ' ']
        for review in review_data
    ]

    # Return the review set as a one-dimensional list.
    return list(itertools.chain(*seg_fil_result))
def store_word_sent_num_features(filepath, sheetnum, colnum, data, storepath): data = tp.get_excel_data(filepath, sheetnum, colnum, 'data') word_sent_num = word_sent_count(data) #需要初始化 print word_sent_num f = open(storepath, 'w') for i in word_sent_num: f.write(str(i[0]) + ' ' + str(i[1]) + ' ' + str(i[2]) + '\n') f.close()
def store_adj_adv_v_num_feature(filepath, sheetnum, colnum, data, storepath):
    """Compute adjective/adverb(/verb) count features for every review in an
    excel column and write one space-separated triple per review to *storepath*.

    NOTE(review): the incoming `data` argument is immediately shadowed by a
    fresh read of the excel sheet; it is kept only so the signature stays
    compatible with existing callers.
    """
    data = tp.get_excel_data(filepath, sheetnum, colnum, 'data')
    adj_adv_num = count_adj_adv(data)
    # `with` guarantees the output file is closed even if a write raises
    # (the original open()/close() pair leaked the handle on error).
    with open(storepath, 'w') as f:
        for i in adj_adv_num:
            f.write(str(i[0]) + ' ' + str(i[1]) + ' ' + str(i[2]) + '\n')
def word_by_word_review(filepath, sheetnum, colnum):
    """Read product reviews from an excel sheet, segment each review,
    strip stopwords, and return every remaining word as one flat list.

    filepath -- path of the excel workbook
    sheetnum -- sheet index forwarded to tp.get_excel_data
    colnum   -- column index holding the review text

    Returns a one-dimensional list of words from all reviews.
    """
    # BUG FIX: the row count was fetched via a bare `get_excel_data(...)`,
    # which raises NameError -- only the module alias `tp` is in scope.
    review_data = []
    for cell in tp.get_excel_data(filepath, sheetnum, colnum, 'data')[
            0:tp.get_excel_data(filepath, sheetnum, colnum, 'rownum')]:
        review_data.append(tp.segmentation(cell, 'list'))  # segment every review

    # Read the txt file containing stopwords (one per line).
    stopwords = tp.get_txt_data(
        '/home/sooda/nlp/Review-Helpfulness-Prediction/data/stopword.txt',
        'lines')

    # Filter stopwords (and bare spaces left by the segmenter) from every
    # review; the dead `fil = []` reset of the original was dropped.
    seg_fil_result = [
        [word for word in review if word not in stopwords and word != ' ']
        for review in review_data
    ]

    # Return the review set as a one-dimensional list.
    return list(itertools.chain(*seg_fil_result))
import sklearn
from nltk.classify.scikitlearn import SklearnClassifier

## my classifier path
# Output locations: extracted word/n-gram features and the pickled
# sentiment classifier.
filefeature = 'E:/GraduationProject/pythoncode/project/Prediction/main/result/feature_word_ngram.txt'
filename = 'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/senti_class_word_ngram.pkl'

# 1. Load data
"""
review = tp.get_excel_data("D:/code/sentiment_test/review_set.xlsx", "1", "1", "data")
sentiment_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/review_set.xlsx", "1", "1")
"""
# Raw review texts plus their segmented / stopword-filtered counterparts,
# both read from sheet 1, column 12 of the Samsung workbook.
review = tp.get_excel_data(
    "E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/Samsung.xlsx",
    1, 12, "data")
sentiment_review = tp.seg_fil_senti_excel(
    "E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/Samsung.xlsx",
    1, 12)


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment
# probability by classifier
def create_words_bigrams_scores():
    # Segmented, stopword-filtered positive / negative training reviews.
    posdata = tp.seg_fil_senti_excel(
        "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/pos_review.xlsx",
        1, 1)
    # NOTE(review): this fragment is truncated here in the source view; the
    # call below is reproduced exactly as far as it is visible.
    negdata = tp.seg_fil_senti_excel(
        "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/neg_review.xlsx",
import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn

# 1. Load data
# Raw review texts plus their segmented / stopword-filtered counterparts.
review = tp.get_excel_data("D:/code/sentiment_test/review_set.xlsx", "1", "1")
sentiment_review = tp.seg_fil_senti_excel("D:/code/sentiment_test/review_set.xlsx", "1", "1")


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment
# probability by classifier
def create_words_bigrams_scores():
    # Segmented, stopword-filtered positive / negative training reviews.
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # NOTE(review): the second from_words() call immediately overwrites the
    # first, so posBigrams below is computed from negWords only -- looks
    # like a bug; confirm against the intended algorithm.
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
# NOTE(review): `sys` is used here but no `import sys` is visible in this
# fragment -- presumably imported above the visible region; confirm.
sys.path.append("./Preprocessing module/")
import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn

# 1. Load data
# Raw review texts plus their segmented / stopword-filtered counterparts,
# both read from sheet 1, column 7 of the shared review set.
review = tp.get_excel_data("../../../Review set/review_set.xlsx", 1, 7, "data")
sentiment_review = tp.seg_fil_senti_excel("../../../Review set/review_set.xlsx", 1, 7)


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment
# probability by classifier
def create_words_bigrams_scores():
    # Segmented, stopword-filtered positive / negative training reviews
    # (paths relative to the current working directory).
    posdata = tp.seg_fil_senti_excel("pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # NOTE(review): the second from_words() call immediately overwrites the
    # first, so posBigrams below is computed from negWords only -- looks
    # like a bug; confirm against the intended algorithm.
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
__author__ = 'anchengwu' #coding=utf-8 import sys sys.path.append("../../../Preprocessing module") import pos_neg_senti_dict_feature as pn import textprocessing as tp # Load dataset review = tp.get_excel_data( "../Machine learning features/seniment review set/pos_review.xlsx", 1, 1, "data") #test single dataset print pn.single_review_sentiment_score( '买过散装的粽子才来买礼盒的,礼盒很大气,比超市买的100多的还要好,配置也不错,肉的素的都有,刚煮了个蛋黄粽子很不错,米好蛋黄也黄很香,老板态度很好,还想买一份~' .decode('utf8')) #test all dataset for i in pn.all_review_sentiment_score(pn.sentence_sentiment_score(review)): print i
#! /usr/bin/env python2.7 #coding=utf-8 import textprocessing as tp review = tp.get_excel_data("../data/review_set.xlsx", 1, 1, "data") review_txt = open('reivew.txt', 'wb+') for r in review: print r review_txt.write(r) review_txt.write('\n') review_txt.close()
# NOTE(review): `sys` is used here but no `import sys` is visible in this
# fragment -- presumably imported above the visible region; confirm.
sys.path.append("./Preprocessing module/")
import textprocessing as tp
import cPickle as pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn

# 1. Load data
# Raw review texts plus their segmented / stopword-filtered counterparts,
# both read from sheet 1, column 7 (paths relative to the working dir).
review = tp.get_excel_data("review_set.xlsx", 1, 7, "data")
sentiment_review = tp.seg_fil_senti_excel("review_set.xlsx", 1, 7)


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment
# probability by classifier
def create_words_bigrams_scores():
    # Segmented, stopword-filtered positive / negative training reviews.
    posdata = tp.seg_fil_senti_excel("pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # NOTE(review): the second from_words() call immediately overwrites the
    # first, so posBigrams below is computed from negWords only -- looks
    # like a bug; confirm against the intended algorithm.
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn

# 1. Load data
# NOTE(review): the raw reviews come from the HTC workbook while the
# segmented/filtered reviews come from the Meizu workbook -- the two
# datasets do not match; confirm this is intentional.
review = tp.get_excel_data("/home/hadoop/coding/Review set/HTC Z710t_review_2013.6.5.xlsx", 1, 12, "data")
sentiment_review = tp.seg_fil_senti_excel("/home/hadoop/coding/Review set/Meizu MX_review_2013.6.7.xlsx", 1, 12)


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment
# probability by classifier
def create_words_bigrams_scores():
    # Segmented, stopword-filtered positive / negative training reviews.
    posdata = tp.seg_fil_senti_excel("/home/hadoop/coding/Sentiment features/Machine learning features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("/home/hadoop/coding/Sentiment features/Machine learning features/seniment review set/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # Turn the text into bigram collocations.
    # NOTE(review): the second from_words() call immediately overwrites the
    # first, so later statistics use negWords only -- looks like a bug;
    # confirm against the intended algorithm.  (Fragment truncated here.)
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
"""
# NOTE(review): this fragment begins with a bare triple-quote; it is unclear
# whether it opens or closes a block-comment string, and no matching
# delimiter is visible in this view -- confirm against the full file.
import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn

# 1. Load data
review = tp.get_excel_data("D:/code/sentiment_test/review_set.xlsx", "1", "1", "data")
sentiment_review = tp.seg_fil_senti_excel(
    "D:/code/sentiment_test/review_set.xlsx", "1", "1")


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment
# probability by classifier
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
__author__ = 'anchengwu' #coding=utf-8 import sys sys.path.append("../../../Preprocessing module") import pos_neg_senti_dict_feature as pn import textprocessing as tp # Load dataset review = tp.get_excel_data("../Machine learning features/seniment review set/pos_review.xlsx", 1, 1, "data") #test single dataset print pn.single_review_sentiment_score('买过散装的粽子才来买礼盒的,礼盒很大气,比超市买的100多的还要好,配置也不错,肉的素的都有,刚煮了个蛋黄粽子很不错,米好蛋黄也黄很香,老板态度很好,还想买一份~'.decode('utf8')) #test all dataset for i in pn.all_review_sentiment_score(pn.sentence_sentiment_score(review)): print i
# Load sentiment dictionary posdict = tp.get_txt_data("D:/code/sentiment_dictionary/posdict.txt", "lines") negdict = tp.get_txt_data("D:/code/sentiment_dictionary/negdict.txt", "lines") # Load adverbs of degree dictionary mostdict = tp.get_txt_data('D:/code/sentiment_dictionary/most.txt', 'lines') verydict = tp.get_txt_data('D:/code/sentiment_dictionary/very.txt', 'lines') moredict = tp.get_txt_data('D:/code/sentiment_dictionary/more.txt', 'lines') ishdict = tp.get_txt_data('D:/code/sentiment_dictionary/ish.txt', 'lines') insufficientdict = tp.get_txt_data( 'D:/code/sentiment_dictionary/insufficiently.txt', 'lines') inversedict = tp.get_txt_data('D:/code/sentiment_dictionary/inverse.txt', 'lines') # Load dataset review = tp.get_excel_data("D:/code/review_set.xlxs", "1", "1", "data") # 2. Sentiment dictionary analysis basic function # Function of matching adverbs of degree and set weights def match(word, sentiment_value): if word in mostdict: sentiment_value *= 2.0 elif word in verydict: sentiment_value *= 1.5 elif word in moredict: sentiment_value *= 1.25 elif word in ishdict: sentiment_value *= 0.5 elif word in insufficientdict: sentiment_value *= 0.25
'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/SentimentDictionaryFeatures/SentimentDictionary/AdverbsOfDegreeDictionary/ish.txt',
    'lines')
# NOTE(review): this fragment starts mid-expression (the opening
# tp.get_txt_data( call is above the visible region).
insufficientdict = tp.get_txt_data(
    'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/SentimentDictionaryFeatures/SentimentDictionary/AdverbsOfDegreeDictionary/insufficiently.txt',
    'lines')
inversedict = tp.get_txt_data(
    'E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/SentimentDictionaryFeatures/SentimentDictionary/AdverbsOfDegreeDictionary/inverse.txt',
    'lines')

# Load dataset
#review = tp.get_excel_data("E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/HTC.xlsx", 1, 12, "data")
#review = tp.get_excel_data("E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/OPPO.xlsx", 1, 12, "data")
#review = tp.get_excel_data("E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/MeiZuMX.xlsx", 1, 12, "data")
#review = tp.get_excel_data("E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/Samsung.xlsx", 1, 12, "data")
review = tp.get_excel_data(
    "E:/GraduationProject/pythoncode/project/Prediction/main/ReviewSet/Motorala.xlsx",
    1, 11, "data")  # get the values of the first sheet's column from the excel file
#print review[1]


# 2. Sentiment dictionary analysis basic function
# Function of matching adverbs of degree and set weights
def match(word, sentiment_value):
    """Scale sentiment_value by the weight class of the degree adverb `word`.

    NOTE(review): the function is truncated in this view -- the remaining
    branches and the return are not visible.
    """
    if word in mostdict:
        sentiment_value *= 2.0
    elif word in verydict:
        sentiment_value *= 1.5
    elif word in moredict:
        sentiment_value *= 1.25
# 1. Load dictionary and dataset # Load sentiment dictionary posdict = tp.get_txt_data("D:/code/sentiment_dictionary/posdict.txt","lines") negdict = tp.get_txt_data("D:/code/sentiment_dictionary/negdict.txt","lines") # Load adverbs of degree dictionary mostdict = tp.get_txt_data('D:/code/sentiment_dictionary/most.txt', 'lines') verydict = tp.get_txt_data('D:/code/sentiment_dictionary/very.txt', 'lines') moredict = tp.get_txt_data('D:/code/sentiment_dictionary/more.txt', 'lines') ishdict = tp.get_txt_data('D:/code/sentiment_dictionary/ish.txt', 'lines') insufficientdict = tp.get_txt_data('D:/code/sentiment_dictionary/insufficiently.txt', 'lines') inversedict = tp.get_txt_data('D:/code/sentiment_dictionary/inverse.txt', 'lines') # Load dataset review = tp.get_excel_data("D:/code/review_set.xlxs", "1", "1", "data") # 2. Sentiment dictionary analysis basic function # Function of matching adverbs of degree and set weights def match(word, sentiment_value): if word in mostdict: sentiment_value *= 2.0 elif word in verydict: sentiment_value *= 1.5 elif word in moredict: sentiment_value *= 1.25 elif word in ishdict: sentiment_value *= 0.5 elif word in insufficientdict: sentiment_value *= 0.25
#! /usr/bin/env python2.7 # coding=utf-8 import textprocessing as tp review = tp.get_excel_data("../data/review_set.xlsx", 1, 1, "data") review_txt = open("reivew.txt", "wb+") for r in review: print r review_txt.write(r) review_txt.write("\n") review_txt.close()
'/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_dictionary/more.txt',
    'lines')
# NOTE(review): this fragment starts mid-expression (the opening
# tp.get_txt_data( call is above the visible region).
ishdict = tp.get_txt_data(
    '/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_dictionary/ish.txt',
    'lines')
insufficientdict = tp.get_txt_data(
    '/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_dictionary/insufficiently.txt',
    'lines')
inversedict = tp.get_txt_data(
    '/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_dictionary/inverse.txt',
    'lines')

# Load dataset
#review = tp.get_excel_data("/home/sooda/nlp/Review-Helpfulness-Prediction/data/review_set.xlxs", "1", "1", "data")
review = tp.get_excel_data(
    "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/review_set.xlsx",
    1, 1, "data")


# 2. Sentiment dictionary analysis basic function
# Function of matching adverbs of degree and set weights
def match(word, sentiment_value):
    """Scale sentiment_value by the weight class of the degree adverb `word`.

    NOTE(review): the function is truncated in this view, mid-branch -- the
    remaining branches and the return are not visible.
    """
    if word in mostdict:
        sentiment_value *= 2.0
    elif word in verydict:
        sentiment_value *= 1.5
    elif word in moredict:
        sentiment_value *= 1.25
    elif word in ishdict:
        sentiment_value *= 0.5
    elif word in insufficientdict:
import textprocessing as tp
import pickle
import itertools
from random import shuffle

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import sklearn

# 1. Load data
# Raw review texts plus their segmented / stopword-filtered counterparts.
review = tp.get_excel_data("/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/review_set.xlsx", 1, 1, "data")
sentiment_review = tp.seg_fil_senti_excel("/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/review_set.xlsx", 1, 1)


# 2. Feature extraction method
# Used for transform review to features, so it can calculate sentiment
# probability by classifier
def create_words_bigrams_scores():
    # Segmented, stopword-filtered positive / negative training reviews.
    posdata = tp.seg_fil_senti_excel("/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    # NOTE(review): the second from_words() call immediately overwrites the
    # first, so posBigrams below is computed from negWords only -- looks
    # like a bug; confirm against the intended algorithm.
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)