def pos_or_neg_reviews2pkl():
    """Tokenize the raw negative and positive reviews and pickle the results.

    Both review sets are read with ``iohelper.read_file2list`` (keys
    ``'neg'`` and ``'pos'``), run through ``word_tokenization`` and written
    to ``./Reviews/neg_reviews.pkl`` and ``./Reviews/pos_reviews.pkl``.
    The saved value is described by the original notes as a list of token
    lists: ``[[...], [...], ...]``.
    """
    labels = ('neg', 'pos')
    pickle_paths = ('./Reviews/neg_reviews.pkl', './Reviews/pos_reviews.pkl')

    # Stage by stage (read, tokenize, save), negative before positive.
    raw_reviews = [iohelper.read_file2list(label) for label in labels]
    tokenized = [word_tokenization(reviews) for reviews in raw_reviews]
    for tokens, path in zip(tokenized, pickle_paths):
        iohelper.save_objects2pickle(tokens, path)
def classifier_score(tp, classifier, train_list, test, test_tag):
    """Train a wrapped classifier, pickle it, and score it on a test set.

    Args:
        tp: Classifier name; the trained model is pickled to
            ``'./Reviews/' + tp + '.pkl'``.
        classifier: Estimator instance to wrap in ``SklearnClassifier``.
        train_list: Training data passed to ``SklearnClassifier.train``.
        test: Feature sets passed to ``classify_many``.
        test_tag: True labels for ``test``; ``'pos'`` is the positive class,
            anything else counts as negative.

    Returns:
        A 4-tuple ``(interval, pos_precision, pos_recall, accuracy)``.
        ``interval`` is the elapsed wall-clock time in units of 100
        microseconds and covers training, pickling, prediction and the
        precision/recall computation (accuracy is computed afterwards).
    """
    starttime = datetime.datetime.now()

    classifier = SklearnClassifier(classifier)
    classifier.train(train_list)
    iohelper.save_objects2pickle(classifier, './Reviews/' + tp + '.pkl')

    pred = classifier.classify_many(test)  # list of predicted labels

    # Binarise the labels so precision/recall refer to the 'pos' class.
    y_true = [1 if tag == 'pos' else 0 for tag in test_tag]
    y_pred = [1 if tag == 'pos' else 0 for tag in pred]
    pos_precision = precision_score(y_true, y_pred)
    pos_recall = recall_score(y_true, y_pred)

    endtime = datetime.datetime.now()
    # timedelta.microseconds is only the sub-second component (0..999999),
    # so it wraps for runs of one second or longer.  total_seconds() gives
    # the full duration; scale it to the original unit of 100 microseconds.
    interval = (endtime - starttime).total_seconds() * 1000000 / 100

    return interval, pos_precision, pos_recall, accuracy_score(test_tag, pred)
def word_preprocessing(blog_corpus, subdir):
    """Select feature words from tokenized microblogs by TF-IDF and pickle them.

    Words rejected by ``is_word_invalid`` are removed from every document.
    Each remaining document is then scored term by term with ``tf_idf``
    against the filtered corpus.  For every document with at least three
    distinct terms, its three highest-scoring terms are recorded; when a
    word is selected from several documents the score from the last one
    wins.  Documents with fewer than three distinct terms contribute
    nothing.

    The result, a list of ``[word, tfidf]`` pairs, is handed to
    ``iohelper.save_objects2pickle`` together with ``subdir`` and the name
    ``'wordTFDict'``.

    Args:
        blog_corpus: Tokenized documents, e.g. ``[["w1", "w2"], ["w3"]]``.
        subdir: Forwarded to ``iohelper.save_objects2pickle``; the original
            notes suggest a date directory such as ``20160311`` -- confirm
            against ``iohelper``.
    """
    distinct_words = set()
    new_blog_corpus = []
    for doc in blog_corpus:
        kept = [word for word in doc if not is_word_invalid(word)]
        distinct_words.update(kept)
        new_blog_corpus.append(kept)
    print('word_preprocessing-all microblog number and new %d %d' % (len(blog_corpus), len(new_blog_corpus)))
    print('word_preprocessing-all word number %d' % len(distinct_words))

    word_tfidf_dict = {}
    for doc in new_blog_corpus:
        score_dict = {}
        for term in doc:
            # A repeated term gets the same score, so compute it only once
            # (assumes tf_idf depends only on its arguments).
            if term not in score_dict:
                score_dict[term] = tf_idf(term, doc, new_blog_corpus)
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        score_list = sorted(score_dict.items(), key=lambda d: d[1], reverse=True)
        if len(score_list) >= 3:
            for word, score in score_list[:3]:
                word_tfidf_dict[word] = score

    word_tfidf_list = [[word, score] for word, score in word_tfidf_dict.items()]
    print('word_preprocessing-all new word number %d' % len(word_tfidf_list))

    iohelper.save_objects2pickle(word_tfidf_list, subdir, 'wordTFDict')
    print('word_preprocessing-save word_list_tfidf success!')
def word_preprocessing(blog_corpus, subdir):
    """Select feature words from tokenized microblogs by TF-IDF and pickle them.

    Words rejected by ``is_word_invalid`` are removed from every document.
    Each remaining document is then scored term by term with ``tf_idf``
    against the filtered corpus.  For every document with at least three
    distinct terms, its three highest-scoring terms are recorded; when a
    word is selected from several documents the score from the last one
    wins.  Documents with fewer than three distinct terms contribute
    nothing.

    The result, a list of ``[word, tfidf]`` pairs, is handed to
    ``iohelper.save_objects2pickle`` together with ``subdir`` and the name
    ``'wordTFDict'``.

    Args:
        blog_corpus: Tokenized documents, e.g. ``[["w1", "w2"], ["w3"]]``.
        subdir: Forwarded to ``iohelper.save_objects2pickle``; the original
            notes suggest a date directory such as ``20160311`` -- confirm
            against ``iohelper``.
    """
    distinct_words = set()
    new_blog_corpus = []
    for doc in blog_corpus:
        kept = [word for word in doc if not is_word_invalid(word)]
        distinct_words.update(kept)
        new_blog_corpus.append(kept)
    print('word_preprocessing-all microblog number and new %d %d' % (len(blog_corpus), len(new_blog_corpus)))
    print('word_preprocessing-all word number %d' % len(distinct_words))

    word_tfidf_dict = {}
    for doc in new_blog_corpus:
        score_dict = {}
        for term in doc:
            # A repeated term gets the same score, so compute it only once
            # (assumes tf_idf depends only on its arguments).
            if term not in score_dict:
                score_dict[term] = tf_idf(term, doc, new_blog_corpus)
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        score_list = sorted(score_dict.items(), key=lambda d: d[1], reverse=True)
        if len(score_list) >= 3:
            for word, score in score_list[:3]:
                word_tfidf_dict[word] = score

    word_tfidf_list = [[word, score] for word, score in word_tfidf_dict.items()]
    print('word_preprocessing-all new word number %d' % len(word_tfidf_list))

    iohelper.save_objects2pickle(word_tfidf_list, subdir, 'wordTFDict')
    print('word_preprocessing-save word_list_tfidf success!')
def main():
    """Cross-validate a lexicon scorer and several supervised classifiers.

    Steps:
      1. Load the tokenized positive/negative reviews from their pickles
         and shuffle each list.
      2. Score words plus bigrams (the progress message labels this
         chi-square feature selection), keep the 1500 best in the global
         ``best_words`` and pickle them.
      3. Build labelled feature sets for both classes; they must have the
         same length.
      4. For every entry of ``Classifier_Type`` run ten folds, printing the
         precision, recall, accuracy and elapsed time of each fold, then
         print the per-classifier averages.

    The ``'Lexicons'`` entry is scored with ``sentiment_lexicon_score`` on
    the token lists; every other entry is trained and scored through
    ``classifier_score`` on the feature sets.
    """
    global best_words

    pos_tk_lst = iohelper.read_pickle2objects('./Reviews/pos_reviews.pkl')
    neg_tk_lst = iohelper.read_pickle2objects('./Reviews/neg_reviews.pkl')

    # Randomise the order of both review sets.
    shuffle(pos_tk_lst)
    shuffle(neg_tk_lst)

    # Flatten the per-review token lists into one word list per class.
    posWords = list(itertools.chain.from_iterable(pos_tk_lst))
    negWords = list(itertools.chain.from_iterable(neg_tk_lst))

    # Two feature-selection variants exist: words only, or words plus
    # bigrams.  The words-only variant is kept here for reference:
    # print('1.Word Feature Selection-Chi-sq!')
    # word_scores = create_word_scores(posWords, negWords)
    print('2.Word_Plus_Bigram Feature Selection-Chi-sq!')
    pos_tk_lst = words_plus_bigram(pos_tk_lst)
    neg_tk_lst = words_plus_bigram(neg_tk_lst)
    word_scores = create_word_bigram_scores(posWords, negWords)

    best_words = find_best_words(word_scores, 1500)
    iohelper.save_objects2pickle(best_words, './Reviews/best_words.pkl')

    # Feature sets look like [{'w': True, ...}, 'pos'] / [{...}, 'neg'].
    posFeatures = pos_features(pos_tk_lst, best_word_features)
    negFeatures = neg_features(neg_tk_lst, best_word_features)
    print('POS_FEATURES_LENGTH %d\tNEG_FEATURES_LENGTH %d' % (len(posFeatures), len(negFeatures)))
    assert len(posFeatures) == len(negFeatures)
    print('-------------------------------------------------')

    Classifier_Type = [
        'Lexicons', 'LR', 'BernoulliNB', 'MultinomialNB', 'LinearSVC', 'NuSVC'
    ]  # 'SVC' IS CANCELLED
    # One factory per supervised classifier.  'SVC' is not in
    # Classifier_Type; it stays behind a lambda so the name is only looked
    # up if that entry is ever re-enabled.
    classifier_factories = {
        'LR': LogisticRegression,
        'BernoulliNB': BernoulliNB,
        'MultinomialNB': MultinomialNB,
        'LinearSVC': LinearSVC,
        'NuSVC': lambda: NuSVC(probability=True),
        'SVC': lambda: SVC(gamma=0.001, C=100., kernel='linear', probability=True),
    }
    (pos_lexicon_dict, neg_lexicon_dict) = rp.load_sentiment_lexicon()

    # 10-fold cross-validation: each fold holds out offset_size items per class.
    cut_size = int(len(posFeatures) * 0.9)
    offset_size = len(posFeatures) - cut_size

    avg_scores = dict.fromkeys(Classifier_Type, 0.0)
    avg_precision = dict.fromkeys(Classifier_Type, 0.0)
    avg_recall = dict.fromkeys(Classifier_Type, 0.0)
    avg_time = dict.fromkeys(Classifier_Type, 0.0)

    # Compare the lexicon-based approach with the supervised classifiers.
    for tp in Classifier_Type:
        use_lexicon = tp == 'Lexicons'
        # The lexicon scorer consumes the token lists; the classifiers
        # consume the labelled feature sets.
        if use_lexicon:
            pos_data, neg_data = pos_tk_lst, neg_tk_lst
        else:
            pos_data, neg_data = posFeatures, negFeatures

        print('Classifier_Type : %s' % (tp))
        for k in range(1, 11):
            lo = (k - 1) * offset_size
            hi = k * offset_size
            pos_test = pos_data[lo:hi]
            neg_test = neg_data[lo:hi]

            if use_lexicon:
                test = pos_test + neg_test
                # Tag by the real slice lengths so the labels stay aligned
                # with the data even when the last fold is short.
                test_tag = ['pos'] * len(pos_test) + ['neg'] * len(neg_test)
                elapsed, precision, recall, score = sentiment_lexicon_score(
                    pos_lexicon_dict, neg_lexicon_dict, test, test_tag)
            else:
                # Everything outside the held-out slice is training data.
                train_list = (pos_data[:lo] + pos_data[hi:] +
                              neg_data[:lo] + neg_data[hi:])
                # Split [features, label] pairs into a tuple of feature
                # dicts and a tuple of labels.
                test, test_tag = zip(*(pos_test + neg_test))
                # classifier_score returns four values, time first.
                elapsed, precision, recall, score = classifier_score(
                    tp, classifier_factories[tp](), train_list, test, test_tag)

            avg_scores[tp] += score
            avg_precision[tp] += precision
            avg_recall[tp] += recall
            avg_time[tp] += elapsed
            print('The precision recall accuracy score and training time is repectively : %f %f %f %f' %
                  (precision, recall, score, elapsed))
        # NOTE(review): the separator is printed once per classifier here;
        # confirm that matches the intended output layout.
        print('-------------------------------------------------')

    for tp in Classifier_Type:
        avg_scores[tp] = avg_scores[tp] / 10
        avg_precision[tp] = avg_precision[tp] / 10
        avg_recall[tp] = avg_recall[tp] / 10
        avg_time[tp] = avg_time[tp] / 10
        print("The %s\'s average precision recall accuracy score and training time is repectively : %.2f %.2f %.2f %.2f" %
              (tp, avg_precision[tp], avg_recall[tp], avg_scores[tp], avg_time[tp]))
    print("The End!")
def sentiment_index_compute(pos_lexicon_dict, neg_lexicon_dict):
    """Compute and pickle a 5-minute sentiment index sequence per day.

    The user is asked on stdin which scorer to use (machine learning or
    lexicons) and which aggregation to compute (pos / neg / total / log).

    For every configured date the function walks the clock from
    ``st.opentime1`` to ``st.closetime`` in 5-minute steps, jumping over
    the gap between ``st.midclose`` and ``st.opentime2``.  At each tick it
    reads the review file named ``hour * 100 + minute`` from that date's
    directory, tokenizes it and appends one score to the day's sequence.
    The sequence is pickled to
    ``./Sentiment Index/<date>/saindex_seq.pkl``.

    ``status`` is hard-coded to ``'no'``; setting it to ``'yes'`` would
    instead feed the day's corpus to ``word_preprocessing``.

    Args:
        pos_lexicon_dict: Forwarded to ``sentiment_lexicons_compute``.
        neg_lexicon_dict: Forwarded to ``sentiment_lexicons_compute``.
    """
    # Resolve the line-reading builtin on both Python 2 and Python 3.
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    opentime1 = st.opentime1
    midclose = st.midclose
    opentime2 = st.opentime2
    closetime = st.closetime
    tick_delta = dt.timedelta(minutes=5)

    # 'yes' = lexicon construction, anything else = compute the index.
    status = 'no'

    ml_or_lexicons = read_line("Compute Sentiment Index:Using machine learning(yes) or lexicons(no)? Please input yes or no!")
    choice = read_line("Using only pos(1) or neg(2) or total(3) or logarithm(4)? Please input 1 or 2 or 3 or 4!")
    # Unrecognised answers are passed through unchanged.
    compute_type_names = {'1': 'pos', '2': 'neg', '3': 'total', '4': 'log'}
    computeType = compute_type_names.get(choice, choice)

    # Available dates; only June is processed at the moment.  Extend
    # review_list_day with the other months to process them as well.
    date_of_march = ['20160329', '20160331']
    date_of_april = [
        '20160405', '20160406', '20160407', '20160408', '20160411',
        '20160412', '20160413', '20160414', '20160415', '20160418',
        '20160419', '20160420', '20160421', '20160425', '20160426',
        '20160427', '20160429',
    ]
    date_of_may = [
        '20160503', '20160504', '20160505', '20160506', '20160509',
        '20160510', '20160511', '20160512', '20160513', '20160516',
        '20160517', '20160518', '20160519', '20160520', '20160523',
        '20160524', '20160525', '20160526', '20160527', '20160530',
        '20160531',
    ]
    # june = 10 (compute 10 days sentiment index to make correlation analysis)
    date_of_june = [
        '20160601', '20160602', '20160606', '20160613', '20160614',
        '20160615', '20160620', '20160622', '20160624', '20160628',
    ]
    review_list_day = list(date_of_june)

    for subdir in review_list_day:
        tick_now = opentime1
        blog_corpus = []
        sentiment_index = []
        print('>>>The date to be handled : {0}'.format(subdir))
        while True:
            if (opentime1 <= tick_now <= midclose) or (opentime2 <= tick_now <= closetime):
                hour = tick_now.hour
                minute = tick_now.minute
                # Print per-word scores only for the 09:35 tick.  Compare
                # by value: 'is' tests identity and is wrong for ints.
                isPrint = hour == 9 and minute == 35

                # File name is the tick written as HMM / HHMM, e.g. '935'.
                fname = str(hour * 100 + minute)
                blog_five_min_list = iohelper.read_txt2list(fname, subdir)
                # Convert the 5-minute reviews to token lists: [[...], [...]]
                tick_blog_list = word_tokenization(blog_five_min_list)
                blog_corpus.extend(tick_blog_list)
                tick_now += tick_delta

                # Compute Sentiment Index
                if status != 'yes':
                    if ml_or_lexicons == 'yes':
                        score_tmp = sentiment_machine_learning(
                            tick_blog_list, computeType, isPrint)
                    else:
                        score_tmp = sentiment_lexicons_compute(
                            pos_lexicon_dict, neg_lexicon_dict,
                            tick_blog_list, computeType, isPrint)
                    sentiment_index.append(score_tmp)
            elif midclose < tick_now < opentime2:
                # Skip the midday break.
                tick_now = opentime2
            elif tick_now > closetime:
                break

        # Not necessary if the corpus was already reduced to a TF-IDF word dict.
        if status == 'yes':
            word_preprocessing(blog_corpus, subdir)
            print('%s : word selected from blog_corpus successfully!' % (subdir))
        else:
            iohelper.save_objects2pickle(sentiment_index, './Sentiment Index/' + subdir + '/saindex_seq.pkl')
            print('%s : save_objects2pickle successfully! %d' % (subdir, len(sentiment_index)))
    print('Ending.....')
def main():
    """Cross-validate a lexicon scorer and several supervised classifiers.

    Steps:
      1. Load the tokenized positive/negative reviews from their pickles
         and shuffle each list.
      2. Score words plus bigrams (the progress message labels this
         chi-square feature selection), keep the 1500 best in the global
         ``best_words`` and pickle them.
      3. Build labelled feature sets for both classes; they must have the
         same length.
      4. For every entry of ``Classifier_Type`` run ten folds, printing the
         precision, recall, accuracy and elapsed time of each fold, then
         print the per-classifier averages.

    The ``'Lexicons'`` entry is scored with ``sentiment_lexicon_score`` on
    the token lists; every other entry is trained and scored through
    ``classifier_score`` on the feature sets.
    """
    global best_words

    pos_tk_lst = iohelper.read_pickle2objects('./Reviews/pos_reviews.pkl')
    neg_tk_lst = iohelper.read_pickle2objects('./Reviews/neg_reviews.pkl')

    # Randomise the order of both review sets.
    shuffle(pos_tk_lst)
    shuffle(neg_tk_lst)

    # Flatten the per-review token lists into one word list per class.
    posWords = list(itertools.chain.from_iterable(pos_tk_lst))
    negWords = list(itertools.chain.from_iterable(neg_tk_lst))

    # Two feature-selection variants exist: words only, or words plus
    # bigrams.  The words-only variant is kept here for reference:
    # print('1.Word Feature Selection-Chi-sq!')
    # word_scores = create_word_scores(posWords, negWords)
    print('2.Word_Plus_Bigram Feature Selection-Chi-sq!')
    pos_tk_lst = words_plus_bigram(pos_tk_lst)
    neg_tk_lst = words_plus_bigram(neg_tk_lst)
    word_scores = create_word_bigram_scores(posWords, negWords)

    best_words = find_best_words(word_scores, 1500)
    iohelper.save_objects2pickle(best_words, './Reviews/best_words.pkl')

    # Feature sets look like [{'w': True, ...}, 'pos'] / [{...}, 'neg'].
    posFeatures = pos_features(pos_tk_lst, best_word_features)
    negFeatures = neg_features(neg_tk_lst, best_word_features)
    print('POS_FEATURES_LENGTH %d\tNEG_FEATURES_LENGTH %d' % (len(posFeatures), len(negFeatures)))
    assert len(posFeatures) == len(negFeatures)
    print('-------------------------------------------------')

    Classifier_Type = [
        'Lexicons', 'LR', 'BernoulliNB', 'MultinomialNB', 'LinearSVC', 'NuSVC'
    ]  # 'SVC' IS CANCELLED
    # One factory per supervised classifier.  'SVC' is not in
    # Classifier_Type; it stays behind a lambda so the name is only looked
    # up if that entry is ever re-enabled.
    classifier_factories = {
        'LR': LogisticRegression,
        'BernoulliNB': BernoulliNB,
        'MultinomialNB': MultinomialNB,
        'LinearSVC': LinearSVC,
        'NuSVC': lambda: NuSVC(probability=True),
        'SVC': lambda: SVC(gamma=0.001, C=100., kernel='linear', probability=True),
    }
    (pos_lexicon_dict, neg_lexicon_dict) = rp.load_sentiment_lexicon()

    # 10-fold cross-validation: each fold holds out offset_size items per class.
    cut_size = int(len(posFeatures) * 0.9)
    offset_size = len(posFeatures) - cut_size

    avg_scores = dict.fromkeys(Classifier_Type, 0.0)
    avg_precision = dict.fromkeys(Classifier_Type, 0.0)
    avg_recall = dict.fromkeys(Classifier_Type, 0.0)
    avg_time = dict.fromkeys(Classifier_Type, 0.0)

    # Compare the lexicon-based approach with the supervised classifiers.
    for tp in Classifier_Type:
        use_lexicon = tp == 'Lexicons'
        # The lexicon scorer consumes the token lists; the classifiers
        # consume the labelled feature sets.
        if use_lexicon:
            pos_data, neg_data = pos_tk_lst, neg_tk_lst
        else:
            pos_data, neg_data = posFeatures, negFeatures

        print('Classifier_Type : %s' % (tp))
        for k in range(1, 11):
            lo = (k - 1) * offset_size
            hi = k * offset_size
            pos_test = pos_data[lo:hi]
            neg_test = neg_data[lo:hi]

            if use_lexicon:
                test = pos_test + neg_test
                # Tag by the real slice lengths so the labels stay aligned
                # with the data even when the last fold is short.
                test_tag = ['pos'] * len(pos_test) + ['neg'] * len(neg_test)
                elapsed, precision, recall, score = sentiment_lexicon_score(
                    pos_lexicon_dict, neg_lexicon_dict, test, test_tag)
            else:
                # Everything outside the held-out slice is training data.
                train_list = (pos_data[:lo] + pos_data[hi:] +
                              neg_data[:lo] + neg_data[hi:])
                # Split [features, label] pairs into a tuple of feature
                # dicts and a tuple of labels.
                test, test_tag = zip(*(pos_test + neg_test))
                # classifier_score returns four values, time first.
                elapsed, precision, recall, score = classifier_score(
                    tp, classifier_factories[tp](), train_list, test, test_tag)

            avg_scores[tp] += score
            avg_precision[tp] += precision
            avg_recall[tp] += recall
            avg_time[tp] += elapsed
            print('The precision recall accuracy score and training time is repectively : %f %f %f %f' %
                  (precision, recall, score, elapsed))
        # NOTE(review): the separator is printed once per classifier here;
        # confirm that matches the intended output layout.
        print('-------------------------------------------------')

    for tp in Classifier_Type:
        avg_scores[tp] = avg_scores[tp] / 10
        avg_precision[tp] = avg_precision[tp] / 10
        avg_recall[tp] = avg_recall[tp] / 10
        avg_time[tp] = avg_time[tp] / 10
        print("The %s\'s average precision recall accuracy score and training time is repectively : %.2f %.2f %.2f %.2f" %
              (tp, avg_precision[tp], avg_recall[tp], avg_scores[tp], avg_time[tp]))
    print("The End!")
def sentiment_index_compute(pos_lexicon_dict, neg_lexicon_dict):
    """Compute and pickle a 5-minute sentiment index sequence per day.

    The user is asked on stdin which scorer to use (machine learning or
    lexicons) and which aggregation to compute (pos / neg / total / log).

    For every configured date the function walks the clock from
    ``st.opentime1`` to ``st.closetime`` in 5-minute steps, jumping over
    the gap between ``st.midclose`` and ``st.opentime2``.  At each tick it
    reads the review file named ``hour * 100 + minute`` from that date's
    directory, tokenizes it and appends one score to the day's sequence.
    The sequence is pickled to
    ``./Sentiment Index/<date>/saindex_seq.pkl``.

    ``status`` is hard-coded to ``'no'``; setting it to ``'yes'`` would
    instead feed the day's corpus to ``word_preprocessing``.

    Args:
        pos_lexicon_dict: Forwarded to ``sentiment_lexicons_compute``.
        neg_lexicon_dict: Forwarded to ``sentiment_lexicons_compute``.
    """
    # Resolve the line-reading builtin on both Python 2 and Python 3.
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    opentime1 = st.opentime1
    midclose = st.midclose
    opentime2 = st.opentime2
    closetime = st.closetime
    tick_delta = dt.timedelta(minutes=5)

    # 'yes' = lexicon construction, anything else = compute the index.
    status = 'no'

    ml_or_lexicons = read_line("Compute Sentiment Index:Using machine learning(yes) or lexicons(no)? Please input yes or no!")
    choice = read_line("Using only pos(1) or neg(2) or total(3) or logarithm(4)? Please input 1 or 2 or 3 or 4!")
    # Unrecognised answers are passed through unchanged.
    compute_type_names = {'1': 'pos', '2': 'neg', '3': 'total', '4': 'log'}
    computeType = compute_type_names.get(choice, choice)

    # Available dates; only June is processed at the moment.  Extend
    # review_list_day with the other months to process them as well.
    date_of_march = ['20160329', '20160331']
    date_of_april = [
        '20160405', '20160406', '20160407', '20160408', '20160411',
        '20160412', '20160413', '20160414', '20160415', '20160418',
        '20160419', '20160420', '20160421', '20160425', '20160426',
        '20160427', '20160429',
    ]
    date_of_may = [
        '20160503', '20160504', '20160505', '20160506', '20160509',
        '20160510', '20160511', '20160512', '20160513', '20160516',
        '20160517', '20160518', '20160519', '20160520', '20160523',
        '20160524', '20160525', '20160526', '20160527', '20160530',
        '20160531',
    ]
    # june = 10 (compute 10 days sentiment index to make correlation analysis)
    date_of_june = [
        '20160601', '20160602', '20160606', '20160613', '20160614',
        '20160615', '20160620', '20160622', '20160624', '20160628',
    ]
    review_list_day = list(date_of_june)

    for subdir in review_list_day:
        tick_now = opentime1
        blog_corpus = []
        sentiment_index = []
        print('>>>The date to be handled : {0}'.format(subdir))
        while True:
            if (opentime1 <= tick_now <= midclose) or (opentime2 <= tick_now <= closetime):
                hour = tick_now.hour
                minute = tick_now.minute
                # Print per-word scores only for the 09:35 tick.  Compare
                # by value: 'is' tests identity and is wrong for ints.
                isPrint = hour == 9 and minute == 35

                # File name is the tick written as HMM / HHMM, e.g. '935'.
                fname = str(hour * 100 + minute)
                blog_five_min_list = iohelper.read_txt2list(fname, subdir)
                # Convert the 5-minute reviews to token lists: [[...], [...]]
                tick_blog_list = word_tokenization(blog_five_min_list)
                blog_corpus.extend(tick_blog_list)
                tick_now += tick_delta

                # Compute Sentiment Index
                if status != 'yes':
                    if ml_or_lexicons == 'yes':
                        score_tmp = sentiment_machine_learning(
                            tick_blog_list, computeType, isPrint)
                    else:
                        score_tmp = sentiment_lexicons_compute(
                            pos_lexicon_dict, neg_lexicon_dict,
                            tick_blog_list, computeType, isPrint)
                    sentiment_index.append(score_tmp)
            elif midclose < tick_now < opentime2:
                # Skip the midday break.
                tick_now = opentime2
            elif tick_now > closetime:
                break

        # Not necessary if the corpus was already reduced to a TF-IDF word dict.
        if status == 'yes':
            word_preprocessing(blog_corpus, subdir)
            print('%s : word selected from blog_corpus successfully!' % (subdir))
        else:
            iohelper.save_objects2pickle(sentiment_index, './Sentiment Index/' + subdir + '/saindex_seq.pkl')
            print('%s : save_objects2pickle successfully! %d' % (subdir, len(sentiment_index)))
    print('Ending.....')