def test_neutral(to_stdout=True): """ neutral表示作者的话 :param to_stdout: :return: """ print 'test neutral' sentiment.load('../data/impurity_classifier') result_file = None if not to_stdout: result_file = codecs.open('../data/result.csv', 'w', encoding='gbk', errors='ignore') with codecs.open('../data/clean_neutral.txt', encoding='utf-8') as neutral_file: for line in neutral_file: line = line.strip() prob = sentiment.classify(line) if 0.8 > prob > 0.2: if to_stdout: print (line + ',' + str(prob > 0.5 and 1 or 0) + ',' + str(prob) + cur_linesep).encode('gbk') raw_input('press enter to continue') else: result_file.write(line + ',' + str(prob > 0.5 and 1 or 0) + ',' + str(prob) + cur_linesep) if not to_stdout: result_file.close()
def classify(cases):
    """Classify each sentence in *cases* and print ``text,label,prob``.

    :param cases: iterable of raw sentence strings; each is cleaned with
        ``clean_impurity`` before scoring.
    :return: None (results go to stdout, GBK-encoded).
    """
    # Load the model once — it is loop-invariant; the original reloaded
    # it for every sentence.
    sentiment.load('../data/impurity_classifier')
    for case in cases:
        case = clean_impurity(case)
        prob = sentiment.classify(case)
        label = 1 if prob > 0.5 else 0
        print (case + ',' + str(label) + ',' + str(prob) + cur_linesep).encode('gbk')
def test_sentiment(): print 'test model' sentiment.load('../data/train_impurity_classifier') print 'test_negative' # with codecs.open('../data/test_negative.txt', encoding='utf-8') as negative_file: # for line in negative_file: # if sentiment.classify(line) > 0.1: # print line, raw_input('press enter to continue') print 'test_positive' with codecs.open('../data/test_positive.txt', encoding='utf-8') as positive_file: for line in positive_file: if sentiment.classify(line) < 0.5: print line,
""" Created on Wed Oct 10 10:47:24 2018 @author: Administrator """ import re import pandas as pd import jieba import jieba.analyse import jieba.posseg as pseg from gensim import corpora, models, similarities from snownlp import SnowNLP from snownlp import sentiment sentiment.load( 'D:\\anaconda\\anaconda\\pkgs\\snownlp-0.12.3\\snownlp\\sentiment\\sentiment.marshal' ) jieba.load_userdict('D:\\anaconda\\anaconda\\pkgs\\jieba-0.39\\jieba\\jbj.txt') #发现某类特征 def discover_feature(data, *text): key_list = [] for key in data: #句子根据标点符号分句 keys = re.split('[,~。!?、,. ]', key) for i in keys: for keyword in text: if keyword in i: try: #re匹配关键词之后的字段
0:"nm 给分", # 课程给分情况 1:"rr 课程", # 某位老师开设的课程 2:"nm 类型", # 某门课的类型 3:"rr sst 课程", # 某位老师开设的某种类型的课程 4:"ut 课程", # 某个学院开设了什么课程 5:"ut sst 课程", #某个学院开设的某种类型的课程 6:"rr rr 课程",# 教师a与教师b共同上的课 7:"rr 课程数量",# 某位老师的授课数量 8:"ut sst 给分好课程", #某个学院开设的给分好的课程 9:"sst 给分好的课", # 某个类型里给分不错的课 } # Load the pretrained sentiment classification model data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),'sentiment.marshal') sentiment.load(data_path) class QuestionTemplate(): def __init__(self): self.q_template_dict = { 0:self.get_course_rating, 1:self.get_teacher_courses, 2:self.get_course_type, 3:self.get_teacher_type_courses, 4:self.get_school_courses, 5:self.get_school_type_courses, 6:self.get_course_of_2_teacher, 7:self.get_teacher_course_num, 8:self.get_school_good_courses, 9:self.get_type_good_courses, }
from basic.NovelStructure import * from public.BasicStringMethod import * from novel.cluster.NovelCleanModule import * from novel.chapter.ChapterHtmlFilter import * import logging import re debug = False cur_delimiter = str(chr(1)) # 存储文件的分隔符 number_char_list = [ u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9', u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九', u'十', u'百', u'千' ] sentiment.load('data/impurity_classifier') def number_char_format(raw_chapter_title): """ 将章节标题中的连续数字用0代替,便于进行比较 """ fmt_chapter_title = u'' flag = True for char in raw_chapter_title: if char not in number_char_list: fmt_chapter_title += char flag = True else: if flag: fmt_chapter_title += u'0' flag = False