예제 #1
0
def profile_handler(doc, name, url, path):
    """Build an Employee from the ``<div id="work">`` section of a profile page.

    The matched div is also saved, pretty-printed, to ``<path>/<name>.html``
    unless that file already exists.

    Args:
        doc: HTML markup of the profile page, passed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: URL of the employee's personal page.
        path: Directory in which the HTML snapshot is stored.

    Returns:
        The bare ``Employee(name=name, url=url)`` when the page has no
        ``div#work``; otherwise whatever ``ProfileParser.parse()`` returns
        (presumably the populated employee -- confirm against ProfileParser).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only the name and personal page are stored on the employee up front;
    # the profile markup itself is saved to a separate file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "work"}, limit=1)
    if not divs:
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        # prettify() returns text, but the file is opened in binary mode,
        # so encode explicitly instead of relying on an implicit conversion.
        with open(filename, 'wb') as fp:
            fp.write(div.prettify().encode("utf-8"))

    # Parse the section as plain text.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           force_email=True)
    return parser.parse()
예제 #2
0
def profile_handler(doc, name, url, path):
    """Build an Employee from the fourth ``<table>`` of a profile page.

    The fourth table is saved, pretty-printed, to ``<path>/<name>.html``
    (overwriting any existing file). Its first
    ``<td valign="top" width="577">`` cell is then parsed as plain text.

    Args:
        doc: HTML markup of the profile page, passed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: URL of the employee's personal page.
        path: Directory in which the HTML snapshot is stored.

    Returns:
        The bare ``Employee(name=name, url=url)`` when the page has fewer
        than four tables or no matching cell; otherwise whatever
        ``ProfileParser.parse()`` returns (presumably the populated
        employee -- confirm against ProfileParser).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # The page layout is messy: only the name and personal page are stored
    # on the employee up front; the profile markup is saved to a file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    tables = soup.find_all(name="table", limit=4)
    # The content lives in the fourth table, so all four must be present;
    # a weaker check would let tables[3] raise IndexError.
    if len(tables) < 4:
        return employee

    table_content = tables[3]
    # prettify() returns text, but the file is opened in binary mode,
    # so encode explicitly instead of relying on an implicit conversion.
    with open(filename, 'wb') as fp:
        fp.write(table_content.prettify().encode("utf-8"))

    cells = table_content.find_all("td", attrs={"valign": "top", "width": "577"})
    if not cells:
        return employee

    # Extract the personal information from the first matching cell.
    lines = cells[0].stripped_strings
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()