Пример #1
0
def zhuanli_duplicate():
    """
    Deduplicate the patent table `pss_zhuanli_copy`.

    The rows returned by the GROUP BY (TIVIEW, INVIEW) query are kept; every
    other row of the table is deleted by id.

    :return: None
    """
    kept_rows = dbs.getDics("SELECT * FROM `pss_zhuanli_copy` GROUP BY TIVIEW, INVIEW")
    print(len(kept_rows))
    # Ids are compared in their string form.
    kept_ids = {str(row['id']) for row in kept_rows}

    all_rows = dbs.getDics("SELECT * FROM `pss_zhuanli_copy`")
    id_list = [row['id'] for row in all_rows if str(row['id']) not in kept_ids]

    print(len(id_list))
    d_sql = '''
            DELETE FROM `pss_zhuanli_copy`
            WHERE id =%s
    '''
    print(dbs.exe_many(d_sql, id_list))
Пример #2
0
def institution_email():
    """
    Extend the institution e-mail dictionary file.

    Scans `teacherdata_info.info` (id >= 40146) for e-mail addresses. An
    address that shows up on more than two pages is tentatively treated as an
    institution address and appended to the institution_email.txt dictionary
    under DIR, unless the file already lists it.

    :return: None
    """
    from collections import Counter

    info = dbs.getDics("select id, info from teacherdata_info where id >= 40146")

    # Number of pages on which each address occurs; repeats inside one page
    # are counted once.
    page_counts = Counter()
    for item in info:
        if not item["info"]:
            continue
        info_text = item["info"]
        info_text = info_text.replace("[at]", "@")
        info_text = info_text.replace(" ", "")
        info_text = info_text.replace("\n", "")
        email_text = [i[0] for i in re.findall(reEmail, info_text)]

        if email_text:
            page_counts.update(set(email_text))
            print('#'*20)
        else:
            print('*'*20)

    with open(DIR + "\\dicts\\institution_email.txt", "a+", encoding="utf-8") as file:
        # "a+" positions the stream at end-of-file, so rewind before reading;
        # otherwise the existing entries are never seen.
        file.seek(0)
        existing = file.read()
        known = set(existing.split('\n'))

        list_1 = [
            addr for addr in sorted(page_counts)
            if page_counts[addr] > 2 and addr not in known
        ]
        print(list_1)
        print(len(list_1))
        if list_1:
            # Do not glue the first new entry onto an unterminated last line.
            if existing and not existing.endswith('\n'):
                file.write('\n')
            file.write("\n".join(list_1))
Пример #3
0
def get_paper_data():
    """
    Copy papers of discipline codes starting with '08' into `paper_data`.

    The source id range (0, 6000000] is walked in windows of 10000 ids; rows
    with an empty name or abstract are skipped and the rest are inserted in
    batches of at most 5000. The number of rows handed to the insert is
    printed at the end.
    """
    upper_bound = 6000000
    window = 10000
    batch_size = 5000
    s_sql = '''
        SELECT t1.id, t1.`name`, t1.abstract, t2.discipline_code 
        FROM paper_clean1 t1, teacher_dis_code t2 
        WHERE t2.id = t1.author_id AND t2.discipline_code like '08%%' and t1.id > %s and t1.id <= %s;
    '''
    u_sql = "INSERT paper_data(id, title, abstract, discipline) VALUES(%s, %s, %s, %s);"
    inserted = 0
    for low in range(0, upper_bound - window + 1, window):
        query = s_sql % (str(low), str(low + window))
        print(query)

        pending = []
        for row in dbs.getDics(query):
            if not row["name"] or not row["abstract"]:
                continue

            pending.append((row["id"], row["name"], row["abstract"],
                            row["discipline_code"]))
            if len(pending) == batch_size:
                print(dbs.exe_many(u_sql, pending))
                inserted += batch_size
                pending = []

        # Flush whatever is left of this window (possibly nothing).
        inserted += len(pending)
        print(dbs.exe_many(u_sql, pending))

    print(inserted)
Пример #4
0
def get_edu_exp():
    """
    Extract education-experience items for `teacher_eduexp` rows of type 0.

    Each non-empty `info_clear` text is run through TextAttribute; when it
    yields education items they are stored in `edu_exp` together with the
    type value returned by `get_edu_items()`.
    """
    teacher_list = dbs.getDics("select id, info_clear from teacher_eduexp where type = 0")
    print(len(teacher_list))
    ta = TextAttribute()

    update_list = []
    for teacher in teacher_list:
        text = teacher["info_clear"]
        if text is None or text == "":
            continue
        ta.set_text(text)
        ta.seg_sentence("\n")
        ta.compute_gravity()
        kind, edu_items = ta.get_edu_items()
        if not edu_items:
            continue
        print(teacher["id"])
        print(kind, edu_items)
        update_list.append(("\n".join(edu_items), kind, teacher["id"]))

    # Two counts are printed: the number of teachers with items and the number
    # of queued updates. They are always the same value.
    print(len(update_list))
    print(len(update_list))
    update_sql = "update teacher_eduexp set edu_exp=%s, type=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
Пример #5
0
def data_clean():
    """
    Convert javascript pseudo-links into normal URLs.

    Handles `eds_985teacher` rows of school '中南大学' whose link contains
    "javascript". A stored link looks like
    ``javascript:window.open('/blog/content2?name='+encodeURI('周雄伟'))``.
    The path and the URL-quoted name are concatenated, resolved against the
    row's ``institution_url`` and written to ``all_link``. Links that do not
    have this shape are reported and skipped.

    :return: None
    """
    data_list = dbs.getDics(
        "SELECT * FROM `eds_985teacher` WHERE link like '%javascript%' AND school = '中南大学';"
    )
    print(len(data_list))
    open_call = re.compile(r"open\('(.+?)'\+encodeURI\('(.+?)'\)\)")
    u_list = []
    for data in data_list:
        link = data['link']
        if not link:
            continue
        matched = open_call.search(link)
        if matched is None:
            # The SQL filter only guarantees the word "javascript"; do not
            # crash on links that are not window.open(...encodeURI(...)).
            print("unrecognised link for id %s: %s" % (data['id'], link))
            continue

        path, name = matched.groups()
        link = pa.urljoin(data['institution_url'], path + pa.quote(name))
        print(link)
        u_list.append((link, data['id']))

    print(len(u_list))
    u_sql = "UPDATE eds_985teacher SET all_link=%s WHERE id = %s"
    print(dbs.exe_many(u_sql, u_list))
Пример #6
0
def x_t():
    """
    Count the tokens that jieba's part-of-speech cut labels with flag "x" in
    the titles and abstracts of `paper_data` rows of discipline '0812'.

    NOTE(review): the visible body ends right after the counts are unpacked.
    The output file is opened for writing but nothing is written to it and it
    is not closed here, and `save_list` is unused. The rest of the function
    may be missing; confirm against the original source.
    """
    import jieba.posseg as pseg

    s_sql = "SELECT id, title, abstract FROM paper_data WHERE discipline='0812'"
    data_list = dbs.getDics(s_sql)

    # key: "<word>--SPLIT--<flag>", value: number of occurrences
    x_word_dict = dict()
    for data in data_list:
        title = data['title']
        abstract = data['abstract']

        word_list = pseg.cut(title + "\n" + abstract, HMM=True)
        for w, f in word_list:
            if f == "x":
                key = w + "--SPLIT--" + f
                c = x_word_dict.get(key, 0)
                x_word_dict[key] = c + 1
                pass

    # Opening with 'w' truncates any existing stopword_base.txt.
    fw = open('.\\stopword_base.txt', 'w', encoding='utf8')

    save_list = ["A", "B", "C", "D", "E"]

    for k, v in x_word_dict.items():
        # Split the composite key back into its word and flag parts.
        word = k.split('--SPLIT--')[0]
        flag = k.split('--SPLIT--')[1]
        f = v
Пример #7
0
Файл: tt.py Проект: haha8x/eds
def f():
    """
    Print, for every row of jishulingyuyuxueke.csv, the schools offering the
    disciplines named in its second column.

    The second column holds discipline names joined by '-'. For each row the
    output is a comma-separated list of ``school(discipline/discipline/...)``.
    Discipline names come from dis_name.txt and their schools from the
    `discipline_school` table.
    """
    with open('dis_name.txt', 'r', encoding='utf8') as name_file:
        dis_name = set(name_file.read().split('\n'))

    dis_school_dict = dict()
    # NOTE(review): the discipline name is interpolated straight into the SQL
    # text, so a name containing a quote breaks the query. Switch to bound
    # parameters if dbs.getDics supports them.
    s_sql = "SELECT school FROM `discipline_school` WHERE `name` = '%s' AND school_id IS NOT NULL"
    for name in dis_name:
        print(name)
        dis_school_dict[name] = [i['school'] for i in dbs.getDics(s_sql % name)]

    with open('jishulingyuyuxueke.csv', 'r') as csv_file:
        lingyu_xueke = [tuple(node) for node in csv.reader(csv_file)]

    re_li = list()
    for node in lingyu_xueke:
        # school -> its disciplines for this row, already joined with '/'
        di = dict()
        for d in node[1].split('-'):
            for s in dis_school_dict[d]:
                if di.get(s):
                    di[s] += "/" + d
                else:
                    di[s] = d
        re_li.append(','.join(key + '(' + value + ')' for key, value in di.items()))
    print(re_li)
    print(len(re_li))

    print('\n'.join(re_li))
Пример #8
0
def test():
    """
    Write the frequently repeated e-mail addresses to 1.csv.

    Scans `teacherdata_info.info` (id >= 40146) for e-mail addresses and
    writes one ``address,count`` line for every address found on more than
    three pages, sorted by address.
    """
    from collections import Counter

    info = dbs.getDics("select id, info from teacherdata_info where id >= 40146")

    # Number of pages on which each address occurs; repeats inside one page
    # are counted once.
    page_counts = Counter()
    for item in info:
        if not item["info"]:
            continue
        info_text = item["info"]
        info_text = info_text.replace("[at]", "@")
        info_text = info_text.replace(" ", "")
        info_text = info_text.replace("\n", "")
        email_text = [i[0] for i in re.findall(reEmail, info_text)]

        if email_text:
            page_counts.update(set(email_text))
            print('#' * 20)
        else:
            print('*' * 20)

    list_1 = [
        addr + "," + str(page_counts[addr])
        for addr in sorted(page_counts)
        if page_counts[addr] > 3
    ]
    print(list_1)
    print(len(list_1))
    with open("1.csv", "w", encoding="utf8") as file:
        file.write("\n".join(list_1))
    pass
Пример #9
0
def show_data():
    """
    Count the `teacher_eduexp` rows (ok = 1) that have at least one entry in
    `ne` whose degree is "毕业" and whose org is empty, printing each such
    entry along the way.

    :return: None
    """
    teacher_list = dbs.getDics("select id, ne, exp_clear from teacher_eduexp where ok = 1")
    print(len(teacher_list))
    matched_teachers = 0
    for teacher in teacher_list:
        # The split result is not used; the call is kept as it was.
        _ = teacher["exp_clear"].split('\n')
        # NOTE(review): eval() executes whatever text is stored in `ne`.
        ne_list = eval(teacher["ne"])
        if not ne_list:
            continue
        found = False
        for position in range(len(ne_list)):
            entry = ne_list[position]
            if not entry:
                continue
            degree = entry.get("degree", "")
            org = entry.get("org", "")
            if degree == "毕业" and org == "":
                found = True
                print(entry)
        if found:
            matched_teachers += 1

    print(matched_teachers)
Пример #10
0
def get_email():
    """
    Fill the empty `email` column of `teacherdata_info` (id >= 40146).

    For homepages on cksp.eol.cn the `info` text is evaluated as a dict and
    its "E-mail" entry is searched; otherwise the whole `info` text is
    searched after normalising "[at]" and stripping spaces and newlines.
    Addresses listed in the institution e-mail dictionary are dropped and the
    remaining ones are stored joined by ';'.
    """
    info_sql = "select id, info, homepage from teacherdata_info where id >= 40146 and email=''"
    info = dbs.getDics(info_sql)

    with open(DIR + "\\dicts\\institution_email.txt", "r", encoding="utf-8") as dict_file:
        ins_dict = {line.strip('\n') for line in dict_file}

    update_list = []
    for item in info:
        if not item["info"]:
            continue
        # A NULL homepage is treated like any other non-cksp page.
        if re.search(r'cksp\.eol\.cn', item["homepage"] or "") is not None:
            # NOTE(review): eval() executes whatever text is stored in `info`.
            info_dict = eval(item["info"])
            try:
                email_text = [i[0] for i in re.findall(reEmail, info_dict["E-mail"])]
            except (KeyError, TypeError):
                # No "E-mail" entry, or it is not text: nothing to extract.
                continue
        else:
            info_text = item["info"]
            info_text = info_text.replace("[at]", "@")
            info_text = info_text.replace(" ", "")
            info_text = info_text.replace("\n", "")
            email_text = [i[0] for i in re.findall(reEmail, info_text)]

        if email_text:
            # Drop repeated addresses, keeping first-occurrence order.
            list_email = sorted(set(email_text), key=email_text.index)
            # Drop institution addresses.
            list_email = [addr for addr in list_email if addr not in ins_dict]
            if list_email:
                joined = ";".join(list_email)
                print(joined)
                update_list.append((joined, item["id"]))
    print(len(update_list))
    update_sql = "update teacherdata_info set email=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
Пример #11
0
def clear_1():
    """
    Filter the `edu_exp` text of `teacher_eduexp` rows of type 2, line by line.

    # Remove sentences that describe a job; remove birth-year sentences
    # Remove sentences related to publication information

    NOTE(review): `update_sql` is built at the end, but nothing visible here
    executes it with `update_list`. Confirm whether the write-back is missing.
    :return:
    """
    s_sql = "select id, edu_exp from teacher_eduexp where type = 2"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    # Keyword patterns: job titles, job verbs, publication markers, birth statements.
    re_title = r'讲师|教授|指导|博士生导师|研究生导师|硕士生导师|从事|研究员|所长|院长|博导|硕导'
    re_job = r'任教|任|从事'
    re_publish = r'《|》|出版|学报|杂志'
    # NOTE(review): the class [9,0] also accepts a literal comma; confirm that is intended.
    re_birth = r'[1-2][9,0][0-9]{2}生|出生|生于'

    update_list = []
    num = 0
    for teacher in teacher_list:
        lines = teacher["edu_exp"].split('\n')
        new_lines = []
        for line in lines:
            # Close up words that were written with a space between the two characters.
            line = re.sub(r'教 授', "教授", line)
            line = re.sub(r'讲 师', "讲师", line)
            line = re.sub(r'学 士', "学士", line)
            line = re.sub(r'博 士', "博士", line)
            line = re.sub(r'硕 士', "硕士", line)
            # Birth statements and job verbs drop the line unconditionally.
            if re.findall(re_birth, line):
                continue
            if re.findall(re_job, line):
                continue
            if re.findall(re_title, line):
                # A title line is dropped when it has no degree keyword at all ...
                if len(re.findall(r'学位|学士|硕士|博士|进修|硕博', line)) == 0:
                    continue
                # ... or when its single 博士/硕士 hit is just part of 博士生导师/硕士生导师.
                elif (len(re.findall(r'博士', line)) == 1 and len(re.findall(r'博士生导师', line)) == 1) or (len(re.findall(r'硕士', line)) == 1 and len(re.findall(r'硕士生导师', line)) == 1):
                    continue
            # Publication lines are dropped unless they also carry an education keyword.
            if re.findall(re_publish, line) and len(re.findall(r'博士|硕士|学士|本科|研究生|访问学者|博士后|获|毕业|进修|学习|学位|直博|访问|MSW|硕博', line)) == 0:
                continue
            # Only lines with an education keyword are kept.
            if re.findall(r'博士|硕士|学士|本科|研究生|访问学者|博士后|获|毕业|进修|学习|学位|直博|访问|MSW|硕博', line):
                new_lines.append(line)
            pass
        # Print a before/after pair for every teacher whose text changed.
        t1 = '\n'.join(lines)
        t2 = '\n'.join(new_lines)
        if t1 != t2:
            print(teacher["id"])
            print(t1)
            print('-' * 10)
            print(t2)
            print('-' * 10)
            num += 1
            if num % 1000 == 0:
                print()
        # Every teacher is queued, changed or not.
        update_list.append(('\n'.join(new_lines), 2, teacher["id"]))

    print(num)
    print(len(update_list))
    update_sql = "update teacher_eduexp set exp_clear = %s, clear=%s where id = %s"
Пример #12
0
def date2date():
    """
    Normalise the date strings stored in the `ne` column of `teacher_eduexp`.

    For rows with ok = 1, range separators are unified to "-" and a date such
    as "2017年3月-2017年7月" becomes "2017.3-2017.7". Rows in which any entry
    was touched are written back.

    :return: None
    """
    s_sql = "select id, ne, exp_clear from teacher_eduexp where ok = 1"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))
    update_list = []
    num = 0
    for teacher in teacher_list:
        try:
            # NOTE(review): eval() executes whatever text is stored in `ne`.
            ne_list = eval(teacher["ne"])
        except Exception:
            print(teacher["id"])
            # Skip the row. Without this the previous teacher's entries would
            # be reused and saved under this teacher's id.
            continue
        if not ne_list:
            continue
        flag = 0
        for entry in ne_list:
            if not entry:
                continue
            date = entry.get("date", "")
            if re.findall(r'-|~|~|——|至', date):
                date = re.sub(r'-|~|~|——|至', '-', date)
                flag = 1

            if re.findall(r'年', date):
                date = re.sub(r'年', '.', date)
                date = re.sub(r'月', '', date)
                date = re.sub(r'\.;', ';', date)
                date = date.strip('.')
                flag = 1

            if re.findall(r'\.-', date):
                date = re.sub(r'\.-', '-', date)
                flag = 1
            entry["date"] = date

        if flag == 1:
            num += 1
            print(ne_list)
            update_list.append((str(ne_list), teacher["id"]))

    print("-" * 10)
    print(num)
    print(len(update_list))
    u_sql = "update teacher_eduexp set ne = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
Пример #13
0
def get_abroad():
    """
    Flag teachers with an organisation in `ne` that is judged to be abroad.

    An organisation counts when it is a key of the dictionary loaded from
    in.txt, or when it is not in school2en_dict, matches one of the foreign
    keywords and matches none of the domestic keywords. Flagged ids get
    `abroad` set in `teacher_edu_description`.

    NOTE(review): both dictionary files are opened without being closed, and
    school2en_dict.txt is passed to eval().
    """
    school_dict = eval(open(".\\dicts\\school2en_dict.txt", "r", encoding='utf8').read())

    # fromkeys() stores None for every key, so the `.get(o, "") != ""` test
    # below is true exactly for the names listed in in.txt.
    abroad = {}.fromkeys(open(".\\dicts\\in.txt", "r", encoding='utf8').read().split('\n'))
    s_sql = "select id, ne, exp_clear from teacher_eduexp where ok = 1"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))
    update_list = []
    num = 0
    o_list = []
    for teacher in teacher_list:
        try:
            ne_list = eval(teacher["ne"])
        except:
            # Rows whose `ne` cannot be evaluated are reported and skipped.
            print(teacher["id"])
            continue
        if not ne_list:
            continue
        flag = 0
        for i in range(0, len(ne_list)):

            ne = ne_list[i]

            org_list = ne.get("org", "").split(';')
            for o in org_list:
                # Cut "<university>...系" / "<university>...学院" back to the university name.
                o = re.sub('大学.+?系', '大学', o)
                o = re.sub('大学.+?学院', '大学', o)
                # Operator precedence makes this
                # (unknown school and foreign keyword and no domestic keyword) or (listed in in.txt).
                if school_dict.get(o, "") == "" and re.findall('国|日本|澳大利亚|州|芬兰|瑞典|挪威|冰岛|丹麦|爱沙尼亚'
                                                               '|拉脱维亚|立陶宛|白俄罗斯|俄罗斯|乌克兰|摩尔多瓦|波兰|捷克'
                                                               '|斯洛伐克|匈牙利|德国|奥地利|瑞士|列支敦士登|英国|爱尔兰|荷兰'
                                                               '|比利时|卢森堡|法国|摩纳哥|罗马尼亚|保加利亚|塞尔维亚|马其顿'
                                                               '|阿尔巴尼亚|希腊|斯洛文尼亚|克罗地亚|波斯尼亚和墨塞哥维那'
                                                               '|意大利|梵蒂冈|圣马力诺|马耳他|西班牙|葡萄牙|安道尔', o) \
                        and not re.findall('中国|首都|华东|华北|华南|华西|华中|西北|西南|东北|东南|北京|天津|上海|重庆|河北'
                                           '|山西|辽宁|吉林|黑龙江|江苏|浙江|安徽|福建|江西|山东|河南|湖北|湖南|广东|海南|四川'
                                           '|贵州|云南|陕西|甘肃|青海|台湾|内蒙|广西|西藏|宁夏|新疆|香港|澳门|石家庄|沈阳'
                                           '|哈尔滨|杭州|福州|济南|广州|武汉|成都|昆明|兰州|台北|南宁|银川|太原|长春|南京|合肥'
                                           '|南昌|郑州|长沙|海口|贵阳|西安|西宁|呼和浩特|拉萨|乌鲁木齐', o)\
                        or abroad.get(o, "") != "":
                    flag = 1
                    break
            # One hit is enough for this teacher.
            if flag == 1:
                break
        if flag == 1:
            num += 1
            update_list.append((str(flag), teacher["id"]))

    print(num)

    print(len(update_list))
    # NOTE(review): ids are read from teacher_eduexp but written to
    # teacher_edu_description; presumably the two tables share ids, confirm.
    u_sql = "update teacher_edu_description set abroad = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
Пример #14
0
def paper_seg():
    """
    Print term frequencies for a fixed list of `paper_data` rows.

    Title and abstract are cut with jieba in full mode. Tokens listed in
    dicts\\term.txt are counted per paper (printed with the abstract and id)
    and across all papers (printed at the end as "term,count").
    """
    import re
    import jieba
    jieba.load_userdict('.\\dicts\\user_dict_1.txt')
    # Every listed term maps to 'ok'; only membership matters below.
    term_dict = {}.fromkeys(
        open('.\\dicts\\term.txt', 'r', encoding='utf8').read().split('\n'),
        'ok')

    s_sql = "SELECT id, title, abstract FROM paper_data WHERE id in (632158,632690,633512,634259,634504,644862,645697,647947,651835,696197,697667,698942,701609,701882,702953,703279,719978,778166,781868,782636,785997,787785,788662,788852,789241,868144,869130,869391,869971,870955,871869,873702,877593,878509,878761,1057022,1069453,1070486,1085083,1085705,1086615,1088270,1096989,1328935,1329754,1330950,1333472,1336010,1376006,1379811,1382522,1384484,1519331,1538164,1591831,1912371,1913089,1913270,1915550,1916681,1921026,1921611,1922188,1923005,1923339,1923498,1924501,1925167,1934011,1935109,1942329,1947322,1951950,1955110,1978880,1983142,1986760,1987129,1989538,1990521,1991737,1995202,2023057,2030104,2032255,2032605,2039093,2043059,2045712,2051244,2064811,2090132,2090809,2091235,2102585,2103888)"
    data_list = dbs.getDics(s_sql)

    # term -> occurrences over all selected papers
    fields_dict = dict()

    for data in data_list:
        abstract = data['abstract']
        # term -> occurrences inside this paper
        seg_dict = dict()

        for word in jieba.cut(data['title'] + "\n" + abstract, HMM=False, cut_all=True):
            if word not in term_dict:
                continue
            seg_dict[word] = seg_dict.get(word, 0) + 1
            fields_dict[word] = fields_dict.get(word, 0) + 1

        print(abstract)
        print((str(seg_dict), data['id']))
        print("*" * 10)

    for term, total in fields_dict.items():
        print("%s,%s\n" % (term, str(total)))
Пример #15
0
def get_name():
    """
    Compare ex_name() against the stored name for the first 1000
    `teacherdata_info` rows.

    Prints True or False per row, then the number of rows and the number of
    matches.
    """
    teacher_data = dbs.getDics("SELECT id, name, info FROM `teacherdata_info` LIMIT 1000;")
    hits = 0
    for teacher in teacher_data:
        extracted = ex_name(teacher['info'])
        if extracted != teacher['name']:
            print(False)
            continue
        hits += 1
        print(True)

    print(len(teacher_data))
    print(hits)
Пример #16
0
def clear_3():
    """
    Clean the `ne` field: fill in `date` for entries that lack it, where
    entity recognition put the date into the organisation instead.

    Applies to entries whose date is empty and whose org contains a
    four-digit year-like number.

    NOTE(review): `u_sql` is built at the end, but nothing visible here
    executes it. Only the number of pending updates is printed.
    :return:
    """
    s_sql = "select id, ne, exp_clear from teacher_eduexp where ne != ''"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))
    update_list = []
    for teacher in teacher_list:
        # NOTE(review): eval() executes whatever text is stored in `ne`.
        ne_list = eval(teacher["ne"])
        flag = 0
        for i in range(0, len(ne_list)):
            date = ne_list[i].get("date", "")
            org = ne_list[i].get("org", "")
            new_date_list = []
            new_org_list = []
            # NOTE(review): the class [9,0] also accepts a literal comma; confirm that is intended.
            if date == "" and org != "" and re.findall(r'[1-2][9,0][0-9]{2}', org):
                org_list = org.split(';')
                # The loop variable reuses (and overwrites) the name `org`.
                for org in org_list:
                    # First run of CJK characters in the piece, if any.
                    the_org_list = re.findall(r'[\u4E00-\u9FA5]+', org)
                    if not the_org_list:
                        new_org_list.append(org)
                        continue
                    new_org = the_org_list[0]
                    # Drop a leading 年 from the extracted run.
                    if re.findall(r'^年', new_org):
                        new_org = re.sub(r'^年', "", new_org)
                    # What is left after removing the organisation text becomes the date.
                    new_date = re.sub(new_org, "", org)
                    new_date_list.append(new_date)
                    new_org_list.append(new_org)
            if new_date_list:
                ne_list[i]["date"] = ";".join(new_date_list)
                ne_list[i]["org"] = ";".join(new_org_list)
                flag = 1
        if flag == 1:
            # Show the stored value next to the corrected one.
            print("-" * 10)
            print(teacher["id"])
            print(eval(teacher["ne"]))
            print("-" * 3)
            print(ne_list)
            print("-" * 10)
            update_list.append((str(ne_list), teacher["id"]))

    u_sql = "update teacher_eduexp set ne = %s where id = %s"
    print(len(update_list))
Пример #17
0
def get_tf_df():
    '''
    Write per-discipline term statistics to .\\test\\tf_df\\<code>.csv.

    tf: number of occurrences of the term in the discipline's abstracts
    df: number of abstracts in the discipline that contain the term

    Only tokens whose flag is one of an/j/n/nz/vn and that are not listed in
    stopword_base.txt are counted. The number of papers per discipline code
    is printed at the end.
    :return:
    '''
    import jieba.posseg as pseg

    with open('.\\dicts\\stopword_base.txt', 'r', encoding='utf8') as stop_file:
        stop_words = set(stop_file.read().split('\n'))

    # "0801" ... "0832"
    code_list = ["08%02d" % i for i in range(1, 33)]
    keep_flags = {"an", "j", "n", "nz", "vn"}

    code_num = dict()

    for code in code_list:
        # The code is a string such as '0801'. It is quoted so that it is not
        # sent as the number 801, which is how the other queries on
        # paper_data.discipline are written.
        s_sql = "SELECT abstract FROM paper_data WHERE discipline='%s'" % code
        data_list = dbs.getDics(s_sql)
        tf_dict = dict()
        df_dict = dict()
        for data in data_list:
            doc_words = set()
            for w in pseg.cut(data["abstract"], HMM=True):
                if w.word not in stop_words and w.flag in keep_flags:
                    tf_dict[w.word] = tf_dict.get(w.word, 0) + 1
                    doc_words.add(w.word)

            # Each abstract contributes at most one to a term's df.
            for word in doc_words:
                df_dict[word] = df_dict.get(word, 0) + 1

        with open(".\\test\\tf_df\\%s.csv" % code, "w", encoding="utf8") as fw:
            fw.write("term,tf,df\n")
            for k, v in tf_dict.items():
                fw.write("%s,%s,%s\n" % (k, str(v), str(df_dict.get(k, 0))))
        code_num[code] = len(data_list)

    print(code_num)
Пример #18
0
def get_institution():
    """
    5. Institution score (`institution`)

    Resets `teacher_dis_code.dis_rank` to 0, adds a weight for every matching
    `discipline_school` row (4-character code: 5, 6-character code: 1) and
    copies the result into `teacher_rank.institution`.

    NOTE(review): the weights were previously documented as 2 for a
    first-level key discipline, 1 for a second-level one and 0 for none, but
    the code adds 5 and 1. Confirm which is intended.
    :return:
    """
    sql_initial = '''
            UPDATE teacher_dis_code
            SET dis_rank = 0
        '''
    print(dbs.exe_sql(sql_initial))

    s_sql = '''
        SELECT teacher_dis_code.school, teacher_dis_code.discipline_code, discipline_school.`code`
        FROM `teacher_dis_code`,`discipline_school` 
        WHERE teacher_dis_code.discipline_code != '' 
        AND teacher_dis_code.discipline_code IS NOT NULL 
        AND teacher_dis_code.discipline_code = discipline_school.root
        AND teacher_dis_code.school = discipline_school.school
        GROUP BY teacher_dis_code.school, teacher_dis_code.discipline_code, discipline_school.`code`;
    '''
    # length of discipline_school.code -> weight added to dis_rank
    weight_by_code_length = {4: 5, 6: 1}
    u_list = [
        (weight_by_code_length[len(row['code'])], row['school'], row['discipline_code'])
        for row in dbs.getDics(s_sql)
        if len(row['code']) in weight_by_code_length
    ]

    print(len(u_list))
    u_sql = '''
        UPDATE teacher_dis_code
        SET dis_rank = dis_rank + %s
        WHERE school=%s AND discipline_code=%s
    '''
    print(dbs.exe_many(u_sql, u_list))

    sql_initial_rank = '''
        UPDATE teacher_rank, teacher_dis_code
        SET teacher_rank.institution = teacher_dis_code.dis_rank
        WHERE teacher_rank.teacher_id = teacher_dis_code.id;
    '''
    print(dbs.exe_sql(sql_initial_rank))
Пример #19
0
def get_title():
    """
    Print the academic title found for each 清华大学 teacher.

    The teacher's text is taken from `info`: the "个人简介" entry for
    cksp.eol.cn pages, otherwise all dict values joined, falling back to the
    raw text when `info` cannot be evaluated that way. After the extractor
    has cut the text, the first entry of the title list that occurs in it is
    printed together with the teacher id.
    """
    # Order matters: the first entry found in the text wins.
    title_dict = ["副教授", "助理教授", "教授", "讲师", "助教", "副研究员", "助理研究员", "研究员", "高级工程师", "高级实验师", "高工", "工程师", "实验师"]

    extractor = Extractor()

    select_sql = "SELECT id, name, info, all_link FROM `eds_985teacher` WHERE school = '清华大学';"
    teacher_list = dbs.getDics(select_sql)
    print(len(teacher_list))

    for teacher in teacher_list:
        raw_info = teacher["info"]
        # NOTE(review): eval() executes whatever text is stored in `info`.
        if re.search(r'cksp\.eol\.cn', teacher["all_link"] or "") is not None:
            try:
                person_info = eval(raw_info)["个人简介"]
            except Exception:
                person_info = raw_info
        else:
            try:
                person_info = "".join(list(eval(raw_info).values()))
            except Exception:
                person_info = raw_info
        if person_info is None:
            continue
        # Always load this teacher's text. The cksp fallback used to skip
        # this step, so the previous teacher's text was scanned instead.
        extractor.set_text(person_info)

        # Patterns and sizes handed to the extractor's block cutting.
        re_list = [r'职称|职务', r'个人简介|个人简历', teacher["name"]]
        size = [50, 200, 200]
        extractor.sub()
        extractor.cut_blocks(re_list, size)

        title = ""
        for candidate in title_dict:
            if candidate in extractor.text:
                # "副教授" contains "教授"; a second "教授" hit is read as a full professorship.
                if candidate == "副教授" and len(re.findall(r'教授', extractor.text)) > 1:
                    title = "教授"
                else:
                    title = candidate
                break

        if title != "":
            print((teacher["id"], title))
Пример #20
0
def t():
    """
    Export word/flag frequencies of discipline-0812 papers to
    .\\test\\con_stop.xls.

    Tokens whose flag is in the flag list are written as rows of
    (word, flag, count). Tokens flagged "x" are counted separately and not
    exported. The number of exported rows is printed.
    """
    import jieba.posseg as pseg
    import xlwt

    s_sql = "SELECT id, title, abstract FROM paper_data WHERE discipline='0812'"
    data_list = dbs.getDics(s_sql)

    flag_list = [
        'vd', 'g', 'h', 'f', 's', 'r', 'nt', 'm', 'ad', 'ns', 'zg', 'z', 'ag',
        'q', 'yg', 'd', 'u', 'ul', 'j', 'a', 'y', 'ug', 'vg', 't', 'p', 'mq',
        'uj', 'o', 'uv', 'k', 'c', 'nz', 'nrfg', 'tg', 'i'
    ]
    separator = "--SPLIT--"

    word_dict = dict()
    x_word_dict = dict()
    for data in data_list:
        text = data['title'] + "\n" + data['abstract']
        for w, f in pseg.cut(text, HMM=True):
            if f in flag_list:
                counts = word_dict
            elif f == "x":
                counts = x_word_dict
            else:
                continue
            key = w + separator + f
            counts[key] = counts.get(key, 0) + 1

    wbk = xlwt.Workbook(encoding='utf-8')
    sheet = wbk.add_sheet('sheet1')
    for row, (k, v) in enumerate(word_dict.items()):
        parts = k.split(separator)
        sheet.write(row, 0, parts[0])
        sheet.write(row, 1, parts[1])
        sheet.write(row, 2, v)

    wbk.save('.\\test\\con_stop.xls')
    print(len(word_dict))
Пример #21
0
Файл: test.py Проект: haha8x/eds
def zhuanli_duplicate():
    """
    Delete duplicate patents from `pss_zhuanli_copy`.

    For every (TIVIEW, INVIEW, APD) group returned from `pss_zhuanli`, rows of
    `pss_zhuanli_copy` with the same three values but a different id are
    deleted.
    """
    info_list = dbs.getDics("SELECT * FROM `pss_zhuanli` GROUP BY TIVIEW, INVIEW, APD")

    update_list = [
        (info['TIVIEW'], info['INVIEW'], info['APD'], info['id'])
        for info in info_list
    ]

    for params in update_list:
        print(params)
    print(len(update_list))
    d_sql = '''
            DELETE FROM `pss_zhuanli_copy`
            WHERE TIVIEW=%s
            AND INVIEW=%s
            AND APD=%s
            AND id !=%s
    '''
    print(dbs.exe_many(d_sql, update_list))
Пример #22
0
def ne2sentence():
    """
    Turn the entries stored in `teacher_eduexp.ne` (ok = 1) into sentences and
    store them, one per line, in `teacher.eduexp`.

    For each entry the names of its non-empty fields, joined by ',', are
    passed to nn() together with the entry. Only entries whose degree is
    empty or one of 学士/硕士/博士 are converted. Field combinations not yet
    in the imported `sentence_template` list are appended to it.
    """
    from algorithm.li.extract.templates.ne2sentence_template import sentence_template
    ne_name = ["org", "date", "degree", "country", "state_or_province", "major", "discipline_category", "graduate"]
    # Same list object as the imported one: appends below modify it in place.
    s_t = sentence_template
    teacher_list = dbs.getDics("select id, ne, exp_clear from teacher_eduexp where ok = 1")
    print(len(teacher_list))
    update_list = []
    for teacher in teacher_list:
        try:
            ne_list = eval(teacher["ne"])
        except:
            print(teacher["id"])
            continue
        if not ne_list:
            continue
        str_list = []
        for position in range(len(ne_list)):
            ne = ne_list[position]

            present_fields = [n for n in ne_name if ne.get(n, "") != ""]
            s = ",".join(present_fields)
            if s != "" and s not in s_t:
                s_t.append(s)

            if ne.get("degree", "") in ("", "学士", "硕士", "博士"):
                r = nn(s, ne)
                if r != "":
                    str_list.append(r)

        update_list.append(("\n".join(str_list), teacher["id"]))

    print(len(update_list))
    u_sql = "update teacher set eduexp = %s where id = %s"
    print(dbs.exe_many(u_sql, update_list))
Пример #23
0
def mentor_extract():
    """
    Fill the MENTOR column of `pss_zhuanli_copy`.

    INVIEW is split on ';' and the names that are listed in
    .\\qinghua\\mentor_list.txt are stored, joined by ';'.
    """
    info_list = dbs.getDics("SELECT * FROM `pss_zhuanli_copy`;")
    names = open('.\\qinghua\\mentor_list.txt', 'r', encoding='utf-8').read().split('\n')
    # Only the keys are used; every value is None.
    mentor_dict = {}.fromkeys(names)
    print(mentor_dict)
    print("*" * 10)
    update_list = []
    for item in info_list:
        author_list = item['INVIEW'].split(';')
        mentor_list = [author for author in author_list if author in mentor_dict]
        print(author_list, mentor_list)
        update_list.append((";".join(mentor_list), item['id']))

    u_sql = "UPDATE `pss_zhuanli_copy` SET MENTOR = %s WHERE id=%s"
    print(len(update_list))
    print(dbs.exe_many(u_sql, update_list))
Пример #24
0
def clear_7():
    """
    Print `teacher_eduexp` entries whose organisation field contains a
    date-like run while the matching line of `exp_clear` contains more than
    one such run.

    The visible code only prints; nothing is written back.
    NOTE(review): `exp_list[i]` pairs entry i of `ne` with line i of
    `exp_clear`, presumably because the two are aligned. Confirm.
    :return:
    """
    # s_sql = "select id, ne, exp_clear from teacher_eduexp where ne != '' and id < 1000"
    s_sql = "select id, ne, exp_clear from teacher_eduexp where ne != '' and type=1 and ok = 0 limit 1000"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))
    update_list = []
    num = 0
    for teacher in teacher_list:
        exp_list = teacher["exp_clear"].split('\n')
        # NOTE(review): eval() executes whatever text is stored in `ne`.
        ne_list = eval(teacher["ne"])
        if not ne_list:
            continue
        flag = 1
        print(teacher["id"])
        print(teacher["exp_clear"])
        for i in range(0, len(ne_list)):
            if not ne_list[i]:
                continue
            degree_list = ne_list[i].get("degree", "").split(';')
            org_list = ne_list[i].get("org", "").split(';')
            date_list = ne_list[i].get("date", "").split(';')
            # The comprehension variable is local to each comprehension; the loop index i is unaffected.
            degree_list = [i for i in degree_list if i != ""]
            org_list = [i for i in org_list if i != ""]
            date_list = [i for i in date_list if i != ""]

            org = ne_list[i].get("org", "")
            # NOTE(review): this pattern is not a raw string although it contains
            # backslash escapes, and inside the character class the sequence after
            # 月 is read by `re` as a range rather than as separate literal
            # characters. Confirm the intended character set.
            d = re.findall('[0-9\-年\.月-~~—/]{4,}', org)
            if len(d) == 1:
                # Look for date-like runs in the source line with spaces removed.
                exp = re.sub(r' ', '', exp_list[i])
                da = re.findall(r'[0-9\-年\.月-~~\—―/]{4,}', exp)
                if len(da) > 1:
                    # org = re.sub(r'[0-9\-年\.月-~~——/]{4,}', '', org)
                    # ne_list[i]["org"] = org
                    # ne_list[i]["date"] = da[0]
                    print(da)
                    print(exp)
                    flag = 1
Пример #25
0
def f():
    """
    Rebuild the `info` text of 清华大学 teachers from their stored HTML.

    style/script/head elements, comments and the inline tags are stripped
    from the HTML. The remaining markup is cut into blocks around
    "个人简介|个人简历" and the teacher's name (with any parenthesised part
    removed), and the blocks joined by newlines are written to `info` in
    batches of 1000.
    """
    select_sql = "SELECT id, name, html FROM `eds_985teacher` WHERE school = '清华大学';"
    teacher_list = dbs.getDics(select_sql)
    print(len(teacher_list))
    update_sql = "update eds_985teacher set info=%s where id=%s"
    update_list = []
    for teacher in teacher_list:
        if teacher["html"] is None or teacher["html"] == "":
            continue
        html = teacher["html"]
        for tag in ("style", "script", "head"):
            html = re.sub(reTRIM_closing.format(tag), "", html)
            html = re.sub(reTRIM_closing.format(tag.upper()), "", html)
        html = re.sub(reCOMM, "", html)
        for re_tag in inline_tags:
            html = re.sub(re_tag, "", html)

        # Normalise full-width parentheses with plain string replacement: a
        # lone "(" or ")" is not a valid regular expression and makes re.sub
        # raise. Then drop any parenthesised part of the name.
        name = teacher["name"].replace('\uff08', '(').replace('\uff09', ')')
        name = re.sub(r'\(.*?\)', '', name)
        text_list = cut_blocks(html, re_list=[r'个人简介|个人简历', name])
        if not text_list:
            continue
        text = "\n".join(text_list)
        if text:
            print(teacher["id"])
            update_list.append((text, teacher["id"]))
        if len(update_list) == 1000:
            print("插入……1000")
            print(dbs.exe_many(update_sql, update_list))
            update_list = []
    # Flush the last, partial batch.
    if update_list:
        print("插入……%s" % len(update_list))
        print(dbs.exe_many(update_sql, update_list))
Пример #26
0
def clear_2():
    """
    Keep only the lines of ``edu_exp`` that carry education information.

    For every ``teacher_eduexp`` row with ``type = 2`` the ``edu_exp`` text is
    split on newlines and each line that mentions a title, a job, a
    publication or a birth date is discarded.  Rows that still have at least
    one line left get those lines (newline-joined) written to ``exp_clear``
    with ``clear`` set to 2; rows with empty ``edu_exp`` or no surviving line
    are not updated.

    :return: None
    """
    s_sql = "select id, edu_exp from teacher_eduexp where type = 2"
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    re_title = r'讲师|教授|指导|博士生导师|研究生导师|硕士生导师|从事|研究员|所长|院长|博导|硕导'
    re_job = r'任教|任|从事|留校|留院'
    re_publish = r'《|》|出版|学报|杂志'
    # "[90]", not "[9,0]": inside a character class the comma is a literal,
    # so the old pattern also accepted strings such as "1,00生".
    re_birth = r'[1-2][90][0-9]{2}生|出生|生于|年生'
    # Compile once; a line is rejected as soon as any pattern matches.
    noise_patterns = [
        re.compile(pattern)
        for pattern in (re_title, re_job, re_publish, re_birth)
    ]

    update_list = []
    for teacher in teacher_list:
        edu_exp = teacher["edu_exp"]
        if not edu_exp:
            continue
        new_lines = [
            line for line in edu_exp.split('\n')
            if not any(pattern.search(line) for pattern in noise_patterns)
        ]
        if new_lines:
            print(teacher["id"])
            update_list.append(('\n'.join(new_lines), 2, teacher["id"]))

    # The count is printed twice to keep the console output unchanged (the
    # original printed a separate counter that always equalled this length).
    print(len(update_list))
    print(len(update_list))
    update_sql = "update teacher_eduexp set exp_clear = %s, clear=%s where id = %s"
    print(dbs.exe_many(update_sql, update_list))
Пример #27
0
def get_school():
    """
    4. School rating -> ``teacher_rank.school``

    characteristic      weight
    mentions 985        2
    mentions 211        1
    neither             0

    All rows of ``teacher_rank`` are first reset to 0; teachers whose school
    ``characteristic`` mentions 985 or 211 are then updated with the matching
    weight (985 takes precedence when both appear).

    :return: None
    """
    import re

    sql_initial = '''
        UPDATE teacher_rank
        SET school = 0
    '''
    print(dbs.exe_sql(sql_initial))

    s_sql = '''
        SELECT teacher.id as id, school_info.characteristic as characteristic
        FROM `teacher`, `school_info`
        WHERE teacher.school_id = school_info.id AND teacher.school_id != 0;
    '''
    teacher_list = dbs.getDics(s_sql)
    print(len(teacher_list))

    # Checked in order, so a school tagged with both labels scores as 985.
    weights = (('985', 2), ('211', 1))

    u_list = []
    for row in teacher_list:
        tags = row['characteristic']
        if tags is None or tags == "":
            continue
        for label, weight in weights:
            if re.findall(label, tags):
                u_list.append((weight, row['id']))
                break
    print(len(u_list))

    u_sql = '''
        UPDATE teacher_rank
        SET school=%s
        WHERE teacher_id=%s
    '''
    print(dbs.exe_many(u_sql, u_list))
Пример #28
0
def create_sheet():
    """Export ``pss_zhuanli_clean`` rows to an .xls sheet, one row per mentor.

    For each patent row the ``INVIEW`` field is split on ``;`` and only the
    names listed in ``.\\qinghua\\teacher_list.txt`` (one name per line) are
    kept.  The ``MENTOR`` field is split on ``;`` as well, and for every
    mentor a sheet row is written with the columns: mentor, kept teacher
    names joined by ``;``, ``TIVIEW``, ``PAVIEW``, ``APD``, ``PD``.  The same
    values are echoed to stdout, and the total number of mentor entries is
    printed at the end.

    :return: None
    """
    import xlwt

    s_sql = "SELECT * FROM `pss_zhuanli_clean`;"
    info_list = dbs.getDics(s_sql)

    # Read the whitelist through a context manager so the file handle is
    # closed; a set gives the membership test the old dict was emulating.
    with open('.\\qinghua\\teacher_list.txt', 'r', encoding='utf-8') as fh:
        known_teachers = set(fh.read().split('\n'))

    wbk = xlwt.Workbook(encoding='utf-8')
    sheet = wbk.add_sheet('sheet1')
    row = 0
    mentor_total = 0  # renamed from ``sum`` to stop shadowing the builtin
    for info in info_list:
        teachers = ";".join(
            author for author in info['INVIEW'].split(';')
            if author in known_teachers
        )

        mentor_list = info['MENTOR'].split(';')
        mentor_total += len(mentor_list)
        for mentor in mentor_list:
            # Visual marker for patents that list more than one mentor.
            if len(mentor_list) > 1:
                print("======")
            cells = (mentor, teachers, info['TIVIEW'],
                     info['PAVIEW'], info['APD'], info['PD'])
            print(*cells)

            for col, value in enumerate(cells):
                sheet.write(row, col, value)
            row += 1

    wbk.save('.\\qinghua\\清华院士_专利信息_2018.9.30.xls')
    print(mentor_total)
Пример #29
0
def zhuanli_guanxi_extract():
    """Write every pair of co-listed teachers per patent to an .xls sheet.

    ``TEACHERS`` of each ``pss_zhuanli_copy`` row is split on ``;``; for each
    unordered pair of entries (earlier entry first) one sheet row is written
    with the two names and the patent title ``TIVIEW``.  The number of rows
    written is printed after the workbook is saved.

    :return: None
    """
    import xlwt

    records = dbs.getDics("SELECT TEACHERS, TIVIEW FROM `pss_zhuanli_copy`;")
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('sheet1')

    written = 0
    for record in records:
        names = record['TEACHERS'].split(';')
        title = record['TIVIEW']
        # Pair each name with every name that follows it in the list.
        for pos, first in enumerate(names):
            for second in names[pos + 1:]:
                for col, value in enumerate((first, second, title)):
                    sheet.write(written, col, value)
                written += 1

    workbook.save('.\\qinghua\\材料学院专利合著信息.xls')
    print(written)
Пример #30
0
def check_word():
    """Print the id and title of discipline-0812 papers that mention keywords.

    Every ``paper_data`` row with ``discipline='0812'`` is checked against a
    series of keyword patterns; each pattern that matches prints the row id
    followed by the searched text.  A row matching several patterns is
    therefore printed several times.

    NOTE(review): only the checks visible in this part of the file are
    described here; confirm whether further checks follow.
    """
    import re

    s_sql = "SELECT id, title, abstract FROM paper_data WHERE discipline='0812'"
    data_list = dbs.getDics(s_sql)

    for data in data_list:
        title = data['title']
        # The abstract is fetched by the query but deliberately left out of
        # the search: the real value is commented out and replaced by "".
        # abstract = data['abstract']
        abstract = ""
        # "semi-supervised"
        if re.findall(r'半监督', title + abstract):
            print(data['id'])
            print(title + abstract)
        # "k-nearest-neighbour", either letter case
        if re.findall(r'k近邻|K近邻', title + abstract):
            print(data['id'])
            print(title + abstract)
        # "k value", either letter case
        if re.findall(r'k值|K值', title + abstract):
            print(data['id'])
            print(title + abstract)
        # "CNN" in all-upper or all-lower case only
        if re.findall(r'CNN|cnn', title + abstract):
            print(data['id'])
            print(title + abstract)