Example #1
import os
import os.path as op


def extractdir(path, dirname1, dirname2):
    # Collect every already-segmented file under path.
    text_list = []
    for root, dirs, files in os.walk(path):  # top-down traversal by default
        if files:
            for name in files:
                text_list.append(os.path.join(root, name))
    # Load the regular expressions.
    filename = '/home/chenxl/data_mining_resume/task_2/regexpression.txt'
    dict_ = read_regex(filename)
    # Extract information from every file.
    for txt in text_list:
        print("extracting the file %s" % txt)
        dirpath = op.dirname(txt)  # directory part of the path
        dirpath = dirpath.replace(dirname1, dirname2)
        if not op.exists(dirpath):  # create the resume output folder
            os.makedirs(dirpath)  # makedirs: the rewritten path may be several levels deep
        txtname = op.basename(txt)  # bare file name
        save_path = op.join(dirpath, txtname)  # join the two into the new path
        lines = read_academician(txt)
        dict_jiben = traversal(dict_, lines)
        work, time = main_han('./task_2/title_list.py', txt)
        dict_jiben["人物履历"]["工作经历"]["人物经历"] = work
        dict_jiben["人物履历"]["工作经历"]["任职经历"] = time
        save_dict(dict_jiben, save_path)
        print("save the resume file %s" % save_path)
    print("Extracted %s files in total." % len(text_list))
Example #2
import os
import os.path as op
import json


def extract_from_file(raw_info_path, reg_file):

    # First collect every file under the academician folder for extraction.
    text_list = []
    for root, dirs, files in os.walk(raw_info_path):  # top-down by default: root is the
        # current directory, dirs its sub-directories, files its plain files.
        if files:  # only directories that actually contain files
            for name in files:  # walk every file
                text_list.append(os.path.join(root, name))  # build the full path

    # Segment and extract each file.
    for txt in text_list:
        print(txt + " is being processed...")
        dirpath = op.dirname(txt)  # directory part of txt
        dirpath = dirpath.replace("原始院士信息", '抽取的院士信息')
        if not op.exists(dirpath):  # create the new output folder
            os.makedirs(dirpath)  # the rewritten path may be several levels deep
        txtname = op.basename(txt)  # bare file name
        with open(txt, "r", encoding='utf-8') as f:  # read the file content
            text = f.readlines()
        text_cut = words_cut(text, False)  # word segmentation via BosonNLP
        work = main_han(text_cut)  # extract the work experience
        extract_info = traversal(read_regex(reg_file), text_cut)  # extract the other fields
        extract_info['人物履历']['工作经历']['博士后'] = ''
        extract_info["人物履历"]["工作经历"]["任职"] = work
        extract_info['人物履历']['工作经历']['任免_辞职'] = ''

        extract_info["院士名"] = txtname.split("_")[0]

        if "360" in txtname:
            extract_info["百科名"] = "360百科"
        elif "搜狗百科" in txtname:
            extract_info["百科名"] = "搜狗百科"
        elif "互动百科" in txtname:
            extract_info["百科名"] = "互动百科"
        elif "百度百科" in txtname:
            extract_info["百科名"] = "百度百科"

        # Swap the folder name and join the parts into the new save path.
        save_path = op.join(dirpath, txtname).replace(".txt", ".json")
        with open(save_path, "w", encoding="utf8") as f:
            json.dump(extract_info, f, indent=4, ensure_ascii=False)
        print("{} saved!".format(txtname.replace(".txt", ".json")))
Example #3
import json


def main_(name):
    '''name: the academician's name'''
    reg_file = "/home/chenxl/data_mining_resume/task_2/regexpression.txt"
    # The four encyclopedia sources go through an identical pipeline
    # (crawl -> segment -> extract -> label), so drive them from one table:
    # (key in the result dict, crawler function, encyclopedia label).
    sources = [
        ("sougou", sougou_extract, "搜狗百科"),
        ("baidu", baidu_extract, "百度百科"),
        ("hudong", hudong_extract, "互动百科"),
        ("bk360", bk360_extract, "360百科"),
    ]
    info = {}
    for key, crawl, baike_name in sources:
        raw = crawl(name)  # crawl the raw page text
        cut = words_cut(raw, False)  # word segmentation via BosonNLP
        work = main_han(cut)  # extract the work experience
        result = traversal(read_regex(reg_file), cut)  # extract the other fields
        result['人物履历']['工作经历']['博士后'] = ''
        result["人物履历"]["工作经历"]["任职"] = work
        result['人物履历']['工作经历']['任免_辞职'] = ''
        result["院士名"] = name
        result["百科名"] = baike_name
        info[key] = result

    with open(name + "info.json", "w", encoding="utf8") as f:
        json.dump(info, f, indent=4, ensure_ascii=False)
    extract_result = [info["baidu"], info["sougou"], info["hudong"], info["bk360"]]
    return extract_result
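A minimal usage sketch, assuming the four crawler helpers (sougou_extract, baidu_extract, hudong_extract, bk360_extract) and the extraction helpers are importable; the name below is a hypothetical placeholder:

results = main_("张三")  # writes 张三info.json and returns [baidu, sougou, hudong, bk360]
for result in results:
    print(result["百科名"], result["院士名"])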