def profile_handler(doc, name, url, path):
    """Build an employee record from a profile page with a ``div#work`` section.

    The first ``<div id="work">`` of the page is saved as ``<name>.html``
    under ``path`` (unless that file already exists) and its text strings
    are handed to ``ProfileParser``.

    Args:
        doc: Markup of the profile page, passed to ``BeautifulSoup``.
        name: Employee name; also used as the stem of the saved file name.
        url: Profile page URL stored on the employee.
        path: Directory in which the extracted fragment is saved.

    Returns:
        The ``Employee`` built from ``name`` and ``url`` alone when the page
        has no ``<div id="work">``; otherwise the result of
        ``ProfileParser.parse()``.
    """
    filename = os.path.join(path, name + ".html")
    # Only the name and homepage URL are set here; the resume fragment
    # itself is saved to a separate file under `path`.
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "work"}, limit=1)
    if not divs:
        return employee
    div = divs[0]

    if not os.path.exists(filename):
        # The file is opened in binary mode, so request encoded bytes:
        # prettify() without an encoding returns text, which a binary file
        # object rejects. Render before opening so that a failure does not
        # leave an empty file behind that would suppress later saves.
        content = div.prettify(encoding="utf-8")
        with open(filename, "wb") as fp:
            fp.write(content)

    # Process the section as plain text.
    lines = div.stripped_strings
    parser = ProfileParser(
        lines=lines,
        employee=employee,
        set_attr_hook=set_attr_hook,
        force_email=True,
    )
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Build an employee record from a table-based profile page.

    The fourth ``<table>`` of the page is saved as ``<name>.html`` under
    ``path`` (overwriting any existing file), and the text strings of its
    first ``<td valign="top" width="577">`` cell are handed to
    ``ProfileParser``.

    Args:
        doc: Markup of the profile page, passed to ``BeautifulSoup``.
        name: Employee name; also used as the stem of the saved file name.
        url: Profile page URL stored on the employee.
        path: Directory in which the extracted fragment is saved.

    Returns:
        The ``Employee`` built from ``name`` and ``url`` alone when the page
        has fewer than four tables or no matching cell; otherwise the result
        of ``ProfileParser.parse()``.
    """
    filename = os.path.join(path, name + ".html")
    # The page layout is messy, so only the name and homepage URL are set
    # here; the resume fragment itself is saved to a separate file.
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    tables = soup.find_all(name="table", limit=4)
    # The content lives in the fourth table (index 3), so all four must be
    # present; a smaller threshold lets tables[3] raise IndexError.
    if len(tables) < 4:
        return employee
    table_content = tables[3]

    # The file is opened in binary mode, so request encoded bytes:
    # prettify() without an encoding returns text, which a binary file
    # object rejects.
    content = table_content.prettify(encoding="utf-8")
    with open(filename, "wb") as fp:
        fp.write(content)

    cells = table_content.find_all("td", attrs={"valign": "top", "width": "577"})
    if not cells:
        return employee

    # Extract the person's details from the cell's text strings.
    lines = cells[0].stripped_strings
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()