예제 #1
0
def getSchoolWord(code, k):
    """Count keyword occurrences per school and pickle the result.

    Each line of ``<root>/<name>-<code>/<code>_fenci_tdidf.txt`` is evaluated
    to a dict with the keys ``id`` (paper id) and ``fenci`` (space-separated
    keywords).  A paper is attributed to its author through the pickled
    ``paperTeacher`` mapping and the author to a school through the pickled
    ``teacherSchool`` mapping.  Papers that are not in ``paperTeacher`` are
    skipped.

    The result, ``{school_id: {word: count, ...}, ...}``, is pickled to
    ``<root>/<code>/k<k>/schoolWord``.

    :param code: discipline code; used as the SQL parameter and concatenated
        into file paths, so it must be a ``str``
    :param k: number of topics; only selects the output directory
    :return: None
    :raises KeyError: if an author id is missing from ``teacherSchool``
    """
    sql = "SELECT name FROM `discipline_new` where code=%s"
    # teacher_id -> school_id
    with open(root + '/teacherSchool', 'rb') as fh:
        teacherSchool = pickle.load(fh)
    # Discipline name for this code; it is part of the input directory name.
    name = dbs.getDics(sql, (code,))[0]['name']
    source_dir = root + '/' + name + '-' + code
    # paper_id -> {"author_id": ..., "name": ...}
    with open(root + '/paperTeacher', 'rb') as fh:
        paper = pickle.load(fh)

    schoolWord = {}
    # Tokenised papers, one dict literal per line.
    with open(source_dir + "/" + code + "_fenci_tdidf.txt", 'r',
              encoding="utf8") as tfidf_file:
        for line in tfidf_file:
            # NOTE(review): eval() executes arbitrary code from the file; if
            # the lines are plain literals, ast.literal_eval would be safer.
            record = eval(line)
            paper_id = record['id']
            if paper_id not in paper:
                # No known author for this paper, nothing to attribute.
                continue
            school_id = teacherSchool[paper[paper_id]["author_id"]]
            counts = schoolWord.setdefault(school_id, {})
            for w in record['fenci'].split(' '):
                counts[w] = counts.get(w, 0) + 1

    # Make sure the output directory exists, as getWordPaper does.
    out_dir = root + '/' + code + '/k' + str(k)
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + '/schoolWord', 'wb') as out:
        pickle.dump(schoolWord, out)
def getPaperAndTecher():
    """Build and pickle the paper->teacher and teacher->papers mappings.

    Joins ``teacher_paper`` with ``es_teacher`` and writes two pickles under
    ``root``:

    * ``paperTeacher``: ``{paper_id: {"author_id": teacher_id, "name": name}}``
    * ``teacherPaper``: ``{teacher_id: [paper_id, ...]}``

    If a paper id appears in several rows, the last row wins in
    ``paperTeacher``; every row still contributes to ``teacherPaper``.

    :return: None
    """
    print('paper-teacher and teacher-paper..')
    # a is the paper/teacher link table, b supplies the teacher's name.
    sql = "SELECT a.paper_id,a.teacher_id,b.`name` FROM `teacher_paper` a ,es_teacher b where a.teacher_id=b.ID"
    paper = {}
    teacher = {}

    for r in dbs.getDics(sql):
        paper[r["paper_id"]] = {
            "author_id": r["teacher_id"],
            "name": r["name"],
        }
        # Keyed by teacher id (not name); the value is the list of paper ids.
        teacher.setdefault(r["teacher_id"], []).append(r["paper_id"])

    # Context managers guarantee the pickles are flushed and closed.
    with open(root + '/paperTeacher', 'wb') as out:
        pickle.dump(paper, out)
    with open(root + '/teacherPaper', 'wb') as out:
        pickle.dump(teacher, out)
def getInstitutionName():
    """Pickle institution rows keyed by institution id.

    Joins ``es_institution`` with ``institution_rank`` and writes
    ``{institution_id: row}`` to ``<root>/InstitutionName``.  Each row holds
    all ``es_institution`` columns plus ``total`` from ``institution_rank``.

    :return: None
    """
    print('getInstitutionName..')
    print('得到院系信息')
    sql = "SELECT a.*,b.total from es_institution a join institution_rank b on a.ID=b.institution_id"
    # A later row with the same ID overwrites an earlier one.
    dic = {r['ID']: r for r in dbs.getDics(sql)}
    # Context manager guarantees the pickle is flushed and closed.
    with open(root + '/InstitutionName', 'wb') as out:
        pickle.dump(dic, out)
def getWordPaper(code, k):
    """Build topic->word and word->topic probability maps and pickle them.

    Reads ``<root>/<name>-<code>/k<k>/<code>_topic.txt``.  Everything after
    the first ``:`` on a line is evaluated to a ``{word: probability}`` dict,
    e.g. ``0: {'无人机': 0.046, '直升机': 0.027}``.  The topic id is the
    zero-based line index, not the number written before the colon.

    Two pickles are written to ``<root>/<code>/k<k>/`` (created if missing):

    * ``topicToWord``: ``{topic_id: {word: p, ...}, ...}``
    * ``wordToTopic``: ``{word: {topic_id: p, ...}, ...}``; a word only has
      entries for the topics whose line lists it

    :param code: discipline code; must be a ``str`` (concatenated into paths)
    :param k: number of topics; selects the input and output directories
    :return: None
    """
    print('getWordPaper..')
    sql = "SELECT name FROM `discipline_new` where code=%s"
    name = dbs.getDics(sql, (code, ))[0]['name']
    # e.g. data/<discipline name>-<code>/k<k>
    p = root + '/' + name + '-' + code + '/k' + str(k)

    wordToTopic = {}
    topicToWord = {}
    with open(p + '/' + code + "_topic.txt", 'r', encoding="utf8") as topic_file:
        for topic_id, line in enumerate(topic_file):
            # The first ':' separates the topic number from the keyword dict.
            index = line.find(":")
            # NOTE(review): eval() executes arbitrary code from the file; if
            # the lines are plain literals, ast.literal_eval would be safer.
            words = eval(line[index + 1:])
            topicToWord[topic_id] = words
            for w in words:
                wordToTopic.setdefault(w, {})[topic_id] = words[w]

    path = root + '/' + code + '/k' + str(k)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(path, exist_ok=True)
    with open(path + '/topicToWord', 'wb') as out:
        pickle.dump(topicToWord, out)
    with open(path + '/wordToTopic', 'wb') as out:
        pickle.dump(wordToTopic, out)
def getTeacherAndSchool():
    """Build and pickle the teacher<->school mappings from ``es_teacher``.

    Writes two pickles under ``root``:

    * ``teacherSchool``: ``{teacher_id: school_id}``
    * ``schoolTeacher``: ``{school_id: [teacher_id, ...]}``

    :return: None
    """
    print('getTeacherAndSchool')
    sql = "SELECT a.ID,a.SCHOOL_ID from es_teacher a"
    teacher = {}
    school = {}
    for r in dbs.getDics(sql):
        school.setdefault(r["SCHOOL_ID"], []).append(r["ID"])
        teacher[r["ID"]] = r["SCHOOL_ID"]
    # Context managers guarantee the pickles are flushed and closed.
    with open(root + '/teacherSchool', 'wb') as out:
        pickle.dump(teacher, out)
    with open(root + '/schoolTeacher', 'wb') as out:
        pickle.dump(school, out)
def getPaperTopic(code, k):
    """Attach paper ids to the per-document topic distributions and pickle.

    Two files are combined line by line:

    * ``<root>/<name>-<code>/<code>_fenci_tdidf.txt``: each line evaluates to
      a dict whose ``id`` is the paper id
    * ``<root>/<name>-<code>/k<k>/<code>_teacher_topic.txt``: each line
      evaluates to a sequence of ``(topic_id, probability)`` pairs, e.g.
      ``[(5, 0.75776845), (7, 0.17293692)]``

    Line *i* of the topic file is paired with the id on line *i* of the tf-idf
    file.  The result, ``{paper_id: {topic_id: probability, ...}, ...}``, is
    pickled to ``<root>/<code>/k<k>/paperToic``.  The misspelt file name is
    kept on purpose, since readers elsewhere presumably load it by this name.

    :param code: discipline code; must be a ``str`` (concatenated into paths)
    :param k: number of topics; selects the input and output directories
    :return: None
    :raises IndexError: if the topic file has more lines than the tf-idf file
    """
    print('getPaperTopic')
    sql = "SELECT name FROM `discipline_new` where code=%s"
    name = dbs.getDics(sql, (code, ))[0]['name']
    base = root + '/' + name + '-' + code
    # e.g. <root>/<discipline name>-<code>/k<k>
    p = base + '/k' + str(k)

    # NOTE(review): eval() below executes arbitrary code from the files; if
    # the lines are plain literals, ast.literal_eval would be safer.

    # Paper ids, in file order, from the tf-idf file.
    with open(base + "/" + code + "_fenci_tdidf.txt", 'r',
              encoding="utf8") as id_file:
        ids = [eval(line)['id'] for line in id_file]

    paperToic = {}
    with open(p + '/' + code + "_teacher_topic.txt", 'r',
              encoding="utf8") as topic_file:
        for paper_index, line in enumerate(topic_file):
            pairs = eval(line)
            # t[0] is the topic id, t[1] its probability for this paper.
            paperToic[ids[paper_index]] = {t[0]: t[1] for t in pairs}

    path = root + '/' + code + '/k' + str(k)
    # Make sure the output directory exists, as getWordPaper does.
    os.makedirs(path, exist_ok=True)
    with open(path + '/paperToic', 'wb') as out:
        pickle.dump(paperToic, out)
def getTeacherName():
    """Pickle teacher rows (with their rank weight) keyed by teacher id.

    Joins ``es_teacher`` with ``teacher_rank`` and writes
    ``{teacher_id: row}`` to ``<root>/teacherName``.  Each row holds all
    ``es_teacher`` columns plus ``total`` (a column of ``teacher_rank``).
    The key is taken from the row's ``ID`` field.

    :return: None
    """
    print('getTeacherName..')
    sql = "SELECT a.*,b.total from es_teacher a join teacher_rank b on a.ID=b.teacher_id"
    # A later row with the same ID overwrites an earlier one.
    dic = {r['ID']: r for r in dbs.getDics(sql)}
    # Context manager guarantees the pickle is flushed and closed.
    with open(root + '/teacherName', 'wb') as out:
        pickle.dump(dic, out)
예제 #8
0
def getTeacherWord(code,k):
    """Count keyword occurrences per teacher and pickle the result.

    Each line of ``<root>/<name>-<code>/<code>_fenci_tdidf.txt`` is evaluated
    to a dict with the keys ``id`` (paper id) and ``fenci`` (space-separated
    keywords).  A paper is attributed to its author through the pickled
    ``paperTeacher`` mapping; papers that are not in it are skipped.

    The result, ``{teacher_id: {word: count, ...}, ...}``, is pickled to
    ``<root>/<code>/k<k>/teacherWord``.

    :param code: discipline code; must be a ``str`` (concatenated into paths)
    :param k: number of topics; only selects the output directory
    :return: None
    """
    # Function-scope import: a module-level ``import os`` is not visible from
    # this section of the file.
    import os

    sql = "SELECT name FROM `discipline_new` where code=%s"
    # Query result looks like [{'name': '农业工程'}]; name is the discipline
    # name for this code.
    name = dbs.getDics(sql, (code,))[0]['name']
    path = root + '/' + name + '-' + code
    # paper_id -> {"author_id": ..., "name": ...}
    with open(root + '/paperTeacher', 'rb') as fh:
        paper = pickle.load(fh)

    teacherWord = {}
    # Tokenised papers, one dict literal per line.
    with open(path + "/" + code + "_fenci_tdidf.txt", 'r',
              encoding="utf8") as tfidf_file:
        for line in tfidf_file:
            # NOTE(review): eval() executes arbitrary code from the file; if
            # the lines are plain literals, ast.literal_eval would be safer.
            record = eval(line)
            paper_id = record['id']
            # Per the original author's note, only about 70,000 of roughly
            # 300,000 papers have a known author, so unmatched ids are
            # expected and skipped.
            if paper_id not in paper:
                continue
            counts = teacherWord.setdefault(paper[paper_id]["author_id"], {})
            for w in record['fenci'].split(' '):
                counts[w] = counts.get(w, 0) + 1

    # Make sure the output directory exists before writing into it.
    out_dir = root + '/' + code + '/k' + str(k)
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + '/teacherWord', 'wb') as out:
        pickle.dump(teacherWord, out)