示例#1
0
    def __init__(self, course_path_info):
        """
        Create the text helpers and the empty containers this object fills in.

        Args:
            course_path_info: Stored unchanged on ``self.course_path_info``;
                per the original note it carries the training-sample file
                paths, intermediate-result info, model-result info and so on.
        """
        # Word-segmentation helpers (sentence reader and preprocessor).
        self.sentence_reader = SentenceReader.SentenceReader()
        self.sentence_processor = SentenceProcessor.SenPreprocess()

        # Path info for training samples, intermediate results, models, etc.
        self.course_path_info = course_path_info

        # Words treated as stopwords.
        self.stopwords = [
            u'学',
            u'类',
            u'中国',
            u'国际',
            u'国外',
            u'西方',
        ]

        # List of course names; not loaded yet.
        self.course_name_list = None

        # Maps a string to its segmented (tokenized) form.
        self.sentence_words_dict = {}

        # Catalog (classification directory) data.
        self.catalog_code_dict = {}

        # Second-level catalog entries.
        self.snd_level_catalog = []

        # Course-to-catalog results, split into good and bad matches.
        self.course_catalogs_good_dict = {}
        self.course_catalogs_bad_dict = {}

        # Courses whose catalog could not be recognised.
        self.course_catalog_unknow_list = []
示例#2
0
    def __init__(self,
                 course_scop_file,
                 course_source_filename='course-20181026.txt'):
        """
        Initialise state, helper processors and the course/school scopes.

        This class ties the other classes together so that exam questions
        can be linked to knowledge points automatically:
        1. The given PDF file (course courseware) has to be converted to txt;
           that file is used to extract knowledge points.
        2. The given Excel file (questions exported from the question bank)
           has to be converted to txt; that file is used to train the
           concept semantic network.
        3. Once both steps are done, the automatic linking step is run to
           complete the whole job.

        Args:
            course_scop_file: Passed to ``__initCourseScope`` to set up the
                course scope.
            course_source_filename: Passed to ``CourseProcessor`` when the
                course processor is created.
        """
        self.isTest = False

        # Directory that contains this source file.
        this_file = os.path.realpath(__file__)
        self.__curpath = os.path.dirname(this_file)

        # Course name; the course file's name is used as the course name.
        self.coursename = None

        # Source paths used to produce knowledge points; the source is a
        # (non-scanned) PDF file or a txt file.
        self.courseware_source_docx_filepath = None
        self.courseware_knowledge_txt_filepath = None
        self.courseware_source_txt_filepath = None

        # Source paths used to produce exam questions; the source is an
        # Excel file or a txt file.
        self.questionsourcefilepath = None
        self.questiontargetfilepath = None
        self.questionresultfilepath = None

        # Final statistics.
        self.course_score_list = []
        # Courses already processed, keyed by schoolcode-coursecode.
        self.course_processed_dict = {}
        # Course / exam-question pairs that matched poorly.
        self.course_bad_examquestion = {}
        # Courses that were not recognised.
        self.course_unrecongnized = []
        # Courses that are outside the scope.
        self.course_over_scope = []
        # Archive of every file.
        self.course_base_dict = {}

        # Extracts the knowledge tree from docx documents.
        self.treefactory = TreeFactory.TreeFactory()
        # Sentence processor.
        self.sentence_processor = SentenceProcessor.SenPreprocess()
        # Exam-question processor.
        self.exam_processor = exam_question_processor.ExamQuestionProcessor()

        # Course processor; its course info is shared with the exam processor.
        self.course_processor = CourseProcessor.CourseProcessor(
            course_source_filename)
        course_info = self.course_processor.course_info
        self.exam_processor.setCourseInfo(course_info)

        # Initialise the course scope first, then the school scope.
        self.__initCourseScope(course_scop_file)
        self.__initSchoolScope()
示例#3
0
    def __init__(self, course_path_info_list):
        """
        Create the text helpers and remember the training-sample inputs.

        Args:
            course_path_info_list: Stored unchanged on
                ``self.course_path_info_list``; described by the original
                note as the training sample files.
        """
        # Word-segmentation helpers (sentence reader and preprocessor).
        self.sentence_reader = SentenceReader.SentenceReader()
        self.sentence_processor = SentenceProcessor.SenPreprocess()

        # Knowledge-point set; nothing loaded yet.
        self.knowledge = None

        # Training sample files.
        self.course_path_info_list = course_path_info_list
示例#4
0
    def __init__(self):
        """
        Initialise empty state, build the helpers and load the word tables.

        The ``__init*`` helpers invoked at the end are defined elsewhere in
        the class; they run in the order stop words, suffix words, prefix
        words, middle words, and finally ``__init_re``.
        """
        # Directory that contains this source file.
        this_file = os.path.realpath(__file__)
        self.__curpath = os.path.dirname(this_file)

        # Course file currently in use and the list of course files.
        self.course_filepath = None
        self.course_filepath_list = []

        # Word tables, empty until the init helpers below run.
        self.prefixwords = []
        self.middlewords = []
        self.suffixwords = []
        self.stopwords = {}

        # Helper objects.
        self.wordreader = WordReader.WordReader()
        self.preprocesser = SentenceProcessor.SenPreprocess()
        self.teacher_processor = None

        # Reuse the preprocessor's ``re_level`` attribute.
        self.re_level = self.preprocesser.re_level

        # NOTE: NGram support is currently disabled:
        #   self.ngram = NGram.NGram()

        self.__initStopwords__()
        self.__initSuffixwords__()
        self.__initPrefixwords__()
        self.__initMiddlewords__()
        self.__init_re()
示例#5
0
    def outputfile(self, filepath):
        """
        Write every entry of ``self.category_list`` to ``<filepath>.txt``.

        The ``toString()`` result of each entry is written followed by a
        newline. An existing file at the target path is overwritten.

        Args:
            filepath: Output path without the ``.txt`` suffix; the suffix is
                appended here.
        """
        filepath = u'{}.txt'.format(filepath)
        # The context manager closes the file even if toString() or write()
        # raises part-way through, so the handle is never leaked.
        # NOTE(review): the file is opened with the platform default
        # encoding -- confirm that is intended for non-ASCII content.
        with open(filepath, 'w') as fout:
            for ci in self.category_list:
                fout.write(ci.toString())
                fout.write('\n')

if __name__ == "__main__":
    # Script entry: read each catalog workbook and dump it next to the
    # source file as "<workbook path>.txt".
    catalog = Catalog()

    # Junior-college (associate degree) specialty catalog.
    zhuanke_path = u'D:/奥鹏/运营平台-产品中心/分类目录/专科专业目录-catalog.xlsx'
    catalog.readZhuankeCatalog(zhuanke_path)
    catalog.outputfile(zhuanke_path)

    # Undergraduate specialty catalog.
    benke_path = u'D:/奥鹏/运营平台-产品中心/分类目录/本科专业目录-catalog.xlsx'
    catalog.readBenkeCatalog(benke_path)
    catalog.outputfile(benke_path)

    # Job-position catalog; a sentence processor is attached to the catalog
    # before this workbook is read.
    gangwei_path = u'D:/奥鹏/运营平台-产品中心/分类目录/岗位分类目录-catalog.xlsx'
    catalog.sentence_processor = SentenceProcessor.SenPreprocess()
    catalog.readGangweiCatalog(gangwei_path)
    catalog.outputfile(gangwei_path)