def readRegularKnowledgeList(self):
        """Load the regular knowledge list from the course's knowledge file.

        Each line after the header is expected to be "<code> <word> ...".
        Populates self.knowledge (keyed by the word with apostrophes
        normalized to '.') and self.knowledgeByCode (keyed by the code).
        Missing files are skipped silently.
        """
        if not FilePath.fileExist(self.course_path_info_list[0].
                                  courseware_knowledge_txt_filepath):
            return
        # 'with' guarantees the handle is closed (the original leaked it)
        with open(
                self.course_path_info_list[0].courseware_knowledge_txt_filepath,
                'r') as f:
            ids_lines = f.readlines()
        index = 0
        for line in ids_lines:
            index += 1
            # the first line is a header, not data
            if index == 1:
                continue
            line = line.strip('\n')
            line_k = line.split(' ')
            if len(line_k) < 2:
                continue

            line_k_code = line_k[0]
            line_k_word = line_k[1]
            # apostrophes are normalized to '.' to build a stable dict key
            line_k_word_key = line_k_word.replace(u"'", ".")
            # the file's confidence column is ignored; a fixed value is used
            line_k_confidence = 100

            # keep only the first occurrence of a word
            if line_k_word_key in self.knowledge:
                continue
            words = self.sentence.splitSentenceCanRepeat(line_k_word)
            words = self.preprocessor.enlargeVipWords(words, line_k_word)
            self.knowledge[line_k_word_key] = (words, line_k_confidence,
                                               line_k_code)
            self.knowledgeByCode[line_k_code] = (line_k_word_key,
                                                 line_k_confidence,
                                                 line_k_code)
# ---- Example #2 (scraper artifact separator, commented out) ----
    def train(self):

        # 先检查模型是否存在,如果存在,直接加载
        if FilePath.fileExist(self.course_path_info.vector_model_bin_filepath):
            #self.model_loaded = Word2Vec.load_word2vec_format(self.model_file, binary=True)
            self.model_loaded = KeyedVectors.load_word2vec_format(
                self.course_path_info.vector_model_bin_filepath, binary=True)
            # 输出词典
            #self.output_dict(self.model_loaded.wv.index2word)
            # 生成语料
            self.generate_train_file()
            return

        # 生成语料
        self.generate_train_file()

        # 加载语料
        #sentences = word2vec.Text8Corpus(self.train_output_result_file)
        sentences = LineSentence(
            self.course_path_info.vector_corpus_txt_filepath)
        # 训练skip-gram模型,默认window=5
        # 第一个参数是训练语料,第二个参数是小于该数的单词会被剔除,默认值为5, 第三个参数是神经网络的隐藏层单元数,默认为100
        # 注意:min_count = 1,就是所有词,如果设置大的话,会过滤掉小于的词
        print '正在训练模型...'
        model = Word2Vec(sentences, size=500, min_count=1, iter=5000)
        #model.wv.save(self.model_file)
        model.wv.save_word2vec_format(
            self.course_path_info.vector_model_bin_filepath, binary=True)
        self.model_loaded = model
# ---- Example #3 (scraper artifact separator, commented out) ----
 def loadProcessedFile(self, filepath):
     """Append every non-empty line of *filepath* to self.processed_file.

     Lines are treated as names of already-processed files; a missing
     file is skipped silently.
     """
     if not FilePath.fileExist(filepath):
         return
     # 'with' guarantees the handle is closed (the original leaked it)
     with open(filepath, 'r') as f_input:
         for fname in f_input:
             fname = fname.strip('\n')
             if len(fname) == 0:
                 continue
             self.processed_file.append(fname)
# ---- Example #4 (scraper artifact separator, commented out) ----
    def readFile(self, filepath=None):
        """
        read the excel data
        python操作excel主要用到xlrd和xlwt这两个库,即xlrd是读excel,xlwt是写excel的库。
        可从这里下载https://pypi.python.org/pypi。下面分别记录python读和写excel.
        :param filepath:the excel full path 
        :return: ture if read file ok, false otherwise 
        """
        result_list = []

        # 如果filepath 是空的话,就先看看self.filepath 是否为空
        if filepath is None and self.filepath is None:
            print '请设置读取的文件名称.'
            return result_list

        if filepath is None:
            filepath = self.filepath

        # 检查文件是否存在
        if not FilePath.fileExist(filepath):
            return result_list

        # 打开文件
        workbook = xlrd.open_workbook(filepath)
        # 获取所有sheet
        #print workbook.sheet_names()  # [u'sheet1', u'sheet2']
        #sheet2_name = workbook.sheet_names()[1]
        local_sheet_scope_indexes = self.getSheetScope(workbook)


        totalcount = 0
        for index in local_sheet_scope_indexes:
            sheet = workbook.sheet_by_index(index)
            rowindex = self.start_row_index
            local_sheet_columns_indexes = self.getSheetColumnScope(sheet)
            # 如果列的范围与预期不一致,就跳过该sheet
            if self.column_scope_names is not None and len(local_sheet_columns_indexes) != len(self.column_scope_names):
                print '该sheet没有需要的数据'
                continue
            while rowindex < sheet.nrows:
                row = sheet.row_values(rowindex)
                rowindex = rowindex + 1
                try:
                    one_row = self.addOneRow(row, local_sheet_columns_indexes)
                    result_list.append(one_row)
                    totalcount = totalcount + 1
                    if totalcount % 100 == 0:
                        print '已经读取:{0}行'.format(totalcount)
                except Exception:
                    print '数据异常行数:' + str(rowindex)
                    print '读取数据异常:' + Exception.message


        print '共读取:{0}行'.format(totalcount)

        return result_list
# ---- Example #5 (scraper artifact separator, commented out) ----
 def loadProcessedCourse(self, rootpath):
     """Load already-processed course codes from <rootpath>/statistics-mid.txt.

     Each line is a course base code; the code is stored as both key and
     value in self.course_processed_dict for quick membership checks.
     A missing file is skipped silently.
     """
     output_mid_filepath = '{}/statistics-mid.txt'.format(rootpath)
     if not FilePath.fileExist(output_mid_filepath):
         return
     mid_file = open(output_mid_filepath, 'r')
     for raw_line in mid_file.readlines():
         code = raw_line.strip('\n')
         self.course_processed_dict[code] = code
     mid_file.close()
# ---- Example #6 (scraper artifact separator, commented out) ----
 def loadBaseCourse(self, base_course_file):
     """Load base-course records from *base_course_file*.

     Each line initializes a CourseInfomation.CourseBase; every record is
     appended to self.course_base_list and the last one read becomes
     self.current_base_course. A missing file is skipped silently.
     """
     if not FilePath.fileExist(base_course_file):
         return
     # 'with' guarantees the handle is closed (the original leaked it)
     with open(base_course_file, 'r') as f_input:
         for line in f_input:
             line = line.strip('\n')
             cb = CourseInfomation.CourseBase()
             cb.initByString(line)
             self.course_base_list.append(cb)
             self.current_base_course = cb
# ---- Example #7 (scraper artifact separator, commented out) ----
    def readText(self):
        """Read self.filepath into self.content_rows, one entry per line.

        :return: the populated list, or None when no path is configured or
            the file does not exist (self.content_rows is still reset)
        """
        self.content_rows = []
        if self.filepath is None:
            return
        if not FilePath.fileExist(self.filepath):
            return

        # 'with' guarantees the handle is closed (the original leaked it)
        with open(self.filepath) as f_input:
            for row in f_input:
                self.content_rows.append(row)

        return self.content_rows
 def loadProcessedCourse(self, rootpath):
     """Load processed course scores from <rootpath>/statistics-mid.txt.

     Each line is parsed into a CourseInfomation.CourseScore, indexed in
     self.course_processed_dict under "<school_code>-<course_code>" and
     also appended to self.course_score_list. A missing file is skipped.
     """
     output_mid_filepath = '{}/statistics-mid.txt'.format(rootpath)
     if not FilePath.fileExist(output_mid_filepath):
         return
     mid_file = open(output_mid_filepath, 'r')
     for one_course_str in mid_file.readlines():
         # NOTE(review): the line keeps its trailing newline here;
         # presumably CourseScore.initByString tolerates it — confirm.
         course_score = CourseInfomation.CourseScore()
         course_score.initByString(one_course_str)
         key = '{}-{}'.format(course_score.school_code,
                              course_score.course_code)
         self.course_processed_dict[key] = course_score
         self.course_score_list.append(course_score)
     mid_file.close()
# ---- Example #9 (scraper artifact separator, commented out) ----
    def generate_train_file(self):
        """Build the word2vec training corpus file.

        The corpus combines three sources, one sentence per line:
        courseware sentences, exam-question word lists (lines shaped
        "<knowledge>::<question>"), and the word lists of already
        extracted knowledge points.
        """
        # 'with' guarantees the output is closed even if a reader below
        # raises (the original relied on a trailing manual close()).
        with open(self.course_path_info_list[0].vector_corpus_txt_filepath,
                  'w') as f_out:
            for course_path_info in self.course_path_info_list:

                # Step 1: courseware sentences.
                if course_path_info.courseware_source_txt_filepath:
                    for c_line in self.sentence_reader.splitSentence(
                            course_path_info.courseware_source_txt_filepath):
                        f_out.write(' '.join(c_line))
                        f_out.write('\n')

                # Step 2: exam questions.
                if (course_path_info.examquestion_source_txt_filepath
                        and FilePath.fileExist(
                            course_path_info.examquestion_source_txt_filepath)):
                    # close the question file too (the original leaked it)
                    with open(
                            course_path_info.examquestion_source_txt_filepath,
                            'r') as question:
                        ids_lines = question.readlines()
                    for line in ids_lines:
                        line = line.strip('\n')
                        index = line.find('::')
                        # lines without the separator carry no question text
                        if index < 0:
                            continue
                        q = line[index + 2:]
                        q_words = self.sentence_reader.splitOneSentence(q)
                        # emphasize important words found in the question
                        q_words = self.sentence_processor.enlargeVipWords(
                            q_words, q)
                        f_out.write(' '.join(q_words))
                        f_out.write('\n')

            # Step 3: extracted knowledge points as extra training samples.
            if self.knowledge:
                for k_key in self.knowledge:
                    k_tup = self.knowledge[k_key]
                    f_out.write(' '.join(k_tup[0]))
                    f_out.write('\n')
# ---- Example #10 (scraper artifact separator, commented out) ----
    def subjectSimilarity(self):
        """Group near-duplicate exam questions by vector similarity.

        Questions whose word-vector similarity to a group representative
        exceeds 0.95 are attached to that group; self.result_map keeps
        only groups that actually collected duplicates.
        """
        if not FilePath.fileExist(
                self.course_path_info.examquestion_source_txt_filepath):
            return
        # 'with' guarantees the handle is closed (the original leaked it)
        with open(self.course_path_info.examquestion_source_txt_filepath,
                  'r') as question:
            ids_lines = question.readlines()
        question_map = {}
        for line in ids_lines:
            line = line.strip('\n')
            # find() instead of index(): malformed lines (no '::') are
            # skipped instead of raising ValueError, matching the corpus
            # generator's handling of the same file format.
            index = line.find('::')
            if index < 0:
                continue
            q = line[index + 2:]

            q_words = self.sentence.splitOneSentence(q)
            if len(q_words) == 0:
                continue

            # Compare against the representative of every existing group.
            find_same_flag = False
            for old_q in question_map.keys():
                old_q_list = question_map.get(old_q)
                old_q_words = self.sentence.splitOneSentence(old_q)

                score = self.doc_vec.pred_similarity(q_words, old_q_words)
                if score > 0.95:
                    old_q_list.append(q)
                    find_same_flag = True
                    break
            if not find_same_flag:
                question_map[q] = []

        # Keep only groups that actually found duplicates.
        self.result_map = {}
        for q_key in question_map.keys():
            q_list = question_map.get(q_key)
            if len(q_list) > 0:
                self.result_map[q_key] = q_list
def generateKnowledgeCypher(course_path_info):
    """Generate Cypher statements that import the knowledge tree into Neo4j.

    Reads "<code> <name>" lines from the course's knowledge file and emits
    one combined MERGE statement per parent/child pair, where the parent
    code is the child code minus its last dot-separated segment.

    :return: list of Cypher statement strings (empty if the file is missing)
    """
    cypherlist = []
    if not FilePath.fileExist(course_path_info.courseware_knowledge_txt_filepath):
        return cypherlist
    cypherlist.append("CREATE CONSTRAINT ON (c:Knowledge) ASSERT c.code IS UNIQUE;")
    cypherlist.append("CREATE CONSTRAINT ON (c:Question) ASSERT c.code IS UNIQUE;")
    cypherlist.append("create index on:Question(databaseid);")

    # Load the knowledge file into a code -> name dict; only direct
    # parent/child relations are created for now.
    k_dict = {}
    # 'with' guarantees the handle is closed (the original leaked it)
    with open(course_path_info.courseware_knowledge_txt_filepath, 'r') as f_k:
        for k in f_k:
            k = k.strip('\n')
            k_arr = k.split(' ')
            if len(k_arr) < 2:
                continue
            k_dict[k_arr[0]] = k_arr[1]

    # Emit one combined statement per parent/child pair.
    for k_code, k_name in k_dict.items():
        # The parent code is the child code without its last segment.
        k_code_arr = k_code.split('.')
        k_code_parent = '.'.join(k_code_arr[:-1])
        # Roots (no known parent) need no relationship.
        if k_code_parent not in k_dict:
            continue
        k_name_parent = k_dict.get(k_code_parent)
        k_ns_child = "MERGE (k_child:Knowledge {{code:'{0}'}}) on create set k_child.name='{1}'".format(k_code, k_name)
        k_ns_parent = "MERGE (k_parent:Knowledge {{code:'{0}'}}) on create set k_parent.name='{1}'".format(k_code_parent, k_name_parent)
        k_ns_parent_child = "MERGE (k_parent)-[:CHILD]->(k_child);"

        com = k_ns_child + ' ' + k_ns_parent + ' ' + k_ns_parent_child
        cypherlist.append(com)

    return cypherlist
# ---- Example #12 (scraper artifact separator, commented out) ----
    def generate_train_file(self):
        """Build a training corpus from the undergraduate major catalog.

        Level-1 catalog entries are written as their own corpus line; each
        run of deeper entries is merged into a single line that is flushed
        when the next level-2 entry (or end of file) is reached. Results
        are indexed in self.catalog_code_dict and self.snd_level_catalog.
        """
        self.sentence_words_dict = {}

        # NOTE(review): hard-coded absolute paths — presumably these should
        # come from configuration; confirm before running elsewhere.
        catalog_corpus_file = u'D:/pythonproject/open-neo4j-service/data/course-base/本科专业目录-catalog.corpus.txt'
        catalog_xls_file = u'D:/pythonproject/open-neo4j-service/data/course-base/本科专业目录-catalog.xlsx.txt'

        # 'with' guarantees both handles are closed (the original leaked fin)
        with open(catalog_corpus_file, 'w') as f_out:
            if FilePath.fileExist(catalog_xls_file):
                level_snd_list = []  # names accumulated for the current group
                first_code = ''
                first_name = ''
                index = 0
                with open(catalog_xls_file, 'r') as fin:
                    for line in fin:
                        arr = line.split()
                        # guard: blank/short lines used to raise IndexError
                        if len(arr) < 2:
                            continue
                        code_line = arr[0]
                        name_line = arr[1]
                        code_section_list = code_line.split('.')
                        if index == 0:
                            first_code = code_line
                            first_name = name_line

                        if len(code_section_list) == 1:
                            # Top-level category: its own corpus line.
                            name_line1 = self.preprocessSent(name_line)
                            c_line_words = self.sentence_reader.splitOneSentence(
                                name_line1)
                            c_line_words = self.postWordList(c_line_words)
                            f_out.write(' '.join(c_line_words))
                            f_out.write('\n')
                            self.catalog_code_dict[name_line] = (
                                code_line, name_line, c_line_words)
                        elif len(code_section_list) == 2:
                            # New level-2 entry: flush the previous group.
                            if len(level_snd_list) > 0:
                                self._flush_snd_level(f_out, level_snd_list,
                                                      first_code, first_name)
                            # start a fresh group headed by this entry
                            level_snd_list = [name_line]
                            first_code = code_line
                            first_name = name_line
                        else:
                            # deeper levels join the current group
                            level_snd_list.append(name_line)

                        index += 1
                # Flush the final group. Guarded: the original wrote an
                # empty line and indexed '' when the list was still empty.
                if len(level_snd_list) > 0:
                    self._flush_snd_level(f_out, level_snd_list, first_code,
                                          first_name)

    def _flush_snd_level(self, f_out, level_snd_list, first_code, first_name):
        """Write one merged corpus line for a level-2 group and index it."""
        write_line = ' '.join(level_snd_list)
        write_line1 = self.preprocessSent(write_line)
        c_line_words = self.sentence_reader.splitOneSentence(write_line1)
        c_line_words = self.postWordList(c_line_words)
        section_name = ' '.join(c_line_words)
        f_out.write(section_name)
        f_out.write('\n')
        self.catalog_code_dict[first_name] = (first_code, first_name,
                                              c_line_words)
        # second-level data kept for later lookups
        self.snd_level_catalog.append(
            (first_code, first_name, section_name, c_line_words))
# ---- Example #13 (scraper artifact separator, commented out) ----
    def predication1(self):
        self.course_score = CourseInfomation.CourseScore()
        self.course_score.initCourse(self.course_path_info.course)
        # match(n)-[:NEXT]-(m) where n.name in ['典型','金本位制','指','金币','本位'] return n,m
        if not FilePath.fileExist(
                self.course_path_info.examquestion_source_txt_filepath):
            return
        question = open(self.course_path_info.examquestion_source_txt_filepath,
                        'r')
        ids_lines = question.readlines()
        qindex = 0
        question_knowledge_map = {}

        for line in ids_lines:
            #line = "物权的分类:从设立的角度对他物权再做分类,可把其分为()。,用益物权和担保物权"
            line = line.strip('\n')
            index = line.index('::')
            k = line[0:index]
            q = line[index + 2:]
            question_knowledge_map[q] = k
            qindex = qindex + 1
            q_words = self.sentence.splitSentenceCanRepeat(q)
            # 从q中找重点词, 并放大重点词
            q_words = self.preprocessor.enlargeVipWords(q_words, q)
            if len(q_words) == 0:
                continue
            # 然后再遍历知识点
            index = 0
            res_list = []
            for k_key in self.knowledge.keys():
                k_tup = self.knowledge.get(k_key)
                k_words = k_tup[0]
                if len(k_words) == 0:
                    continue
                score = self.doc_vec.pred_similarity(q_words, k_words)
                res = ResultInfo.ResultInfo(index, score,
                                            k_tup[2] + ' ' + k_key)
                res_list.append(res)
                index += 1
            # 对列表按score降序排列
            res_list.sort(cmp=None, key=lambda x: x.score, reverse=True)
            # 取分值最高的几个,超过1%,的舍去,或者再限定具体数量,比如3个

            # 统计得分的情况
            self.computeScore(res_list)

            # 获取上级 知识点
            #reslist = self.getParentKnowledge(reslist)
            # 格式化输出
            reslist, wordlist = self.formatOutput(res_list, k)
            # 统计正确率
            if len(reslist) > 0:
                ns = '问题{0}:'.format(qindex) + q
                self.outputcontentlist.append(ns + '\n')
                ns = '电脑标识知识点:' + ';'.join(wordlist)
                self.outputcontentlist.append(ns + '\n')
                ns = '知识点评估指标:' + ';'.join(reslist)
                self.outputcontentlist.append(ns + '\n')
                #print '老师标识知识点:' + k
                ns = '老师标识知识点:'
                self.outputcontentlist.append(ns + '\n')
                self.outputcontentlist.append('\n')
                #ns = '电脑标识是否正确:'
                #self.outputcontentlist.append(ns)

        # 计算正确率
        # 题目总数
        self.course_score.compute()

        ns = '试题总数:{}'.format(self.course_score.score_scope_total)
        self.outputcontentlist.append(ns + '\n')
        print ns

        ns = '比较靠谱数(60分以上):{}  ,比较靠谱占比:{}%'.format(
            self.course_score.score_scope_more60_count,
            round(self.course_score.score_scope_more60_rate * 100, 2))
        self.outputcontentlist.append(ns + '\n')
        print ns
        ns = '基本靠谱数(50-60分):{}  ,基本靠谱占比:{}%'.format(
            self.course_score.score_scope_between5060_count,
            round(self.course_score.score_scope_between5060_rate * 100, 2))
        self.outputcontentlist.append(ns + '\n')
        print ns
        ns = '不太靠谱数(40-50分):{}  ,不太靠谱占比:{}%'.format(
            self.course_score.score_scope_between4050_count,
            round(self.course_score.score_scope_between4050_rate * 100, 2))
        self.outputcontentlist.append(ns + '\n')
        print ns
        ns = '不靠谱数(40分以下):{}  ,不靠谱占比:{}%'.format(
            self.course_score.score_scope_less40_count,
            round(self.course_score.score_scope_less40_rate * 100, 2))
        self.outputcontentlist.append(ns + '\n')
        print ns