def __init__(self, course_path_info):
    """Initialise the instance state.

    Args:
        course_path_info: Path information for the training sample files,
            intermediate results, model results, etc. Stored unchanged on
            the instance.
    """
    # Tokenizer / sentence helpers.
    self.sentence_reader = SentenceReader.SentenceReader()
    self.sentence_processor = SentenceProcessor.SenPreprocess()

    # Paths of training samples, intermediate results, model results, etc.
    self.course_path_info = course_path_info

    # List of course names (filled in later).
    self.course_name_list = None

    # Maps a string to its word-segmented form.
    self.sentence_words_dict = {}

    # Catalog (classification directory) data.
    self.catalog_code_dict = {}

    # Course-to-catalog results, split into good and bad matches.
    self.course_catalogs_good_dict = {}
    self.course_catalogs_bad_dict = {}

    # Courses whose category could not be recognised.
    self.course_catalog_unknow_list = []

    self.stopwords = [
        u'学',
        u'类',
        u'中国',
        u'国际',
        u'国外',
        u'西方',
    ]

    # Second-level catalog entries.
    self.snd_level_catalog = []
def __init__(self, course_scop_file,
             course_source_filename='course-20181026.txt'):
    """Initialise the instance state.

    This class ties all the other classes together so that they cooperate
    to automatically associate exam questions with knowledge points:

    1. The given PDF file (course courseware) must be converted to txt;
       that file is used to extract the knowledge points.
    2. The given Excel file (questions exported from the question bank)
       must be converted to txt; that file is used to train the concept
       semantic network.
    3. Once both steps above are done, the automatic association step is
       run to complete the whole job.

    Args:
        course_scop_file: Passed to ``__initCourseScope`` to initialise the
            course scope.
        course_source_filename: Passed to ``CourseProcessor.CourseProcessor``.
    """
    self.isTest = False

    # Directory containing the current source file.
    this_file = os.path.realpath(__file__)
    self.__curpath = os.path.dirname(this_file)

    # Course name; the course file's name is used as the course name.
    self.coursename = None

    # Source file paths used to produce knowledge points; the source is a
    # (non-scanned) PDF or a txt file.
    self.courseware_source_docx_filepath = None
    self.courseware_knowledge_txt_filepath = None
    self.courseware_source_txt_filepath = None

    # Source file paths used to produce exam questions; the source is an
    # Excel or a txt file.
    self.questionsourcefilepath = None
    self.questiontargetfilepath = None
    self.questionresultfilepath = None

    # Extracts the knowledge tree from docx documents.
    self.treefactory = TreeFactory.TreeFactory()

    # Final statistics.
    self.course_score_list = []
    # Courses already processed, keyed by "schoolcode-coursecode".
    self.course_processed_dict = {}
    # Course / exam-question pairs that matched poorly.
    self.course_bad_examquestion = {}
    # Courses that were not recognised.
    self.course_unrecongnized = []
    # Courses that fall outside the configured scope.
    self.course_over_scope = []
    # Archive of every file.
    self.course_base_dict = {}

    # Sentence processor.
    self.sentence_processor = SentenceProcessor.SenPreprocess()
    # Exam-question processor.
    self.exam_processor = exam_question_processor.ExamQuestionProcessor()
    # Course processor; its course info is shared with the exam processor.
    self.course_processor = CourseProcessor.CourseProcessor(
        course_source_filename)
    course_info = self.course_processor.course_info
    self.exam_processor.setCourseInfo(course_info)

    # Initialise the course scope first, then the school scope.
    self.__initCourseScope(course_scop_file)
    self.__initSchoolScope()
def __init__(self, course_path_info_list):
    """Initialise the instance state.

    Args:
        course_path_info_list: Training sample file information; stored
            unchanged on the instance.
    """
    # Tokenizer / sentence helpers.
    self.sentence_reader = SentenceReader.SentenceReader()
    self.sentence_processor = SentenceProcessor.SenPreprocess()

    # Training sample files.
    self.course_path_info_list = course_path_info_list

    # Knowledge-point set (not loaded yet).
    self.knowledge = None
def __init__(self):
    """Initialise the instance state and load the word lists.

    Sets every attribute to its empty default, creates the word reader and
    sentence preprocessor, then runs the stop-word, suffix-word,
    prefix-word and middle-word initialisers followed by ``__init_re``.
    """
    # Directory containing the current source file.
    this_file = os.path.realpath(__file__)
    self.__curpath = os.path.dirname(this_file)

    self.course_filepath = None
    self.course_filepath_list = []

    # Word lists, populated by the initialisers called below
    # (presumably -- their bodies are not visible here).
    self.prefixwords = []
    self.middlewords = []
    self.suffixwords = []
    self.stopwords = {}

    self.wordreader = WordReader.WordReader()
    self.preprocesser = SentenceProcessor.SenPreprocess()
    self.teacher_processor = None

    # Reuse the preprocessor's ``re_level`` attribute.
    self.re_level = self.preprocesser.re_level

    # NGram support is currently disabled:
    # self.ngram = NGram.NGram()

    self.__initStopwords__()
    self.__initSuffixwords__()
    self.__initPrefixwords__()
    self.__initMiddlewords__()
    self.__init_re()
def outputfile(self, filepath):
    """Write every entry of ``self.category_list`` to ``<filepath>.txt``.

    Each entry is serialised with its ``toString()`` method and followed by
    a newline. The target file is opened in ``'w'`` mode, so an existing
    file is overwritten.

    Args:
        filepath: Base path; ``.txt`` is appended to form the output path.
    """
    filepath = u'{}.txt'.format(filepath)
    # The context manager closes the file even if toString()/write() raises.
    with open(filepath, 'w') as fout:
        for ci in self.category_list:
            fout.write(ci.toString())
            fout.write('\n')


if __name__ == "__main__":
    c = Catalog()

    # Junior-college (associate degree) major catalog.
    filepath = u'D:/奥鹏/运营平台-产品中心/分类目录/专科专业目录-catalog.xlsx'
    c.readZhuankeCatalog(filepath)
    c.outputfile(filepath)

    # Undergraduate major catalog.
    filepath = u'D:/奥鹏/运营平台-产品中心/分类目录/本科专业目录-catalog.xlsx'
    c.readBenkeCatalog(filepath)
    c.outputfile(filepath)

    # Job-position catalog; this reader needs a sentence processor.
    filepath = u'D:/奥鹏/运营平台-产品中心/分类目录/岗位分类目录-catalog.xlsx'
    sen_pro = SentenceProcessor.SenPreprocess()
    c.sentence_processor = sen_pro
    c.readGangweiCatalog(filepath)
    c.outputfile(filepath)