Example #1
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="content-wrapper", limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    infos_div = div.find_all('div', attrs={"id": "column-1"})
    if infos_div and len(infos_div) != 0:
        div = infos_div[0]

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           max_line=256)
    return parser.parse()
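
All of these handlers lean on a shared toolkit that this page does not show: an Employee record, a line-oriented ProfileParser, a Config.SOUP_PARSER constant, and imports such as os, BeautifulSoup, and urljoin. The sketch below is a hypothetical, minimal reconstruction of that interface, inferred only from how the handlers call it; the real definitions may differ.

import os
from bs4 import BeautifulSoup

class Config(object):
    SOUP_PARSER = "html.parser"  # assumption: any parser name BeautifulSoup accepts

class Employee(object):
    # Assumed plain record; the fields are inferred from assignments in the examples.
    def __init__(self, name='', url=''):
        self.name = name
        self.url = url
        self.profile = ''
        self.email = ''
        self.tel = ''
        self.fax = ''
        self.title = ''
        self.research = ''
        self.departments = ''

class ProfileParser(object):
    # Assumed behavior: scan plain-text lines for labeled fields and fill the
    # Employee in, honoring set_attr_hook, max_line, and ignore. The label
    # table is illustrative; force_email/force_tel are accepted but unused here.
    LABELS = {u'Email': 'email', u'E-mail': 'email', u'Tel': 'tel'}

    def __init__(self, lines=None, employee=None, set_attr_hook=None,
                 max_line=None, force_email=False, force_tel=False,
                 ignore=None):
        self.lines = lines if lines is not None else []
        self.employee = employee if employee is not None else Employee()
        self.set_attr_hook = set_attr_hook
        self.max_line = max_line
        self.ignore = ignore if ignore is not None else set()

    def parse(self):
        for count, line in enumerate(self.lines):
            if self.max_line is not None and count >= self.max_line:
                break
            for label, attr in self.LABELS.items():
                if line.startswith(label) and attr not in self.ignore:
                    value = line[len(label):].lstrip(u':: ').strip()
                    if self.set_attr_hook is not None:
                        self.set_attr_hook(self.employee, attr, value)
                    else:
                        setattr(self.employee, attr, value)
        return self.employee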
Example #2
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "phy-main"}, limit=1)
    if not divs or len(divs) == 0:
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    #email
    email_div = soup.find_all(name='a', class_="phy-mail")
    if email_div and len(email_div) != 0:
        employee.email = email_div[0].get_text().strip()

    te_div = soup.find_all(name='a', class_="phy-phone")
    if te_div and len(te_div) != 0:
        employee.tel = te_div[0].get_text().strip()

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
Example #3
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="s2_right_con", limit=1)
    if not divs or len(divs) == 0:
        print("can't find div???")
        div = soup
        #return employee
    else:
        div = divs[0]
        if not os.path.exists(filename):
            with open(filename, 'wb') as fp:
                content = div.prettify()
                fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
Example #4
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
        
    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    
    lis = soup.find_all(name="li")
    if not lis or len(lis) != 5:
        div = soup
    else:
        ass = lis[4].find_all('a')
        if len(ass) != 0:
            li_url = ass[0]['href']
            newUrl = urljoin(url,li_url)
            newDoc = get_doc_byUrllib2(newUrl)
            soup = BeautifulSoup(newDoc, Config.SOUP_PARSER)
            mainDiv = soup.find_all('div',attrs={"id":"main"})

            if not mainDiv or len(mainDiv) == 0:
                print "not found main div"
                div = soup
            else:
                div = mainDiv[0]
        else:
            div = soup

    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook,force_email=True)
    return parser.parse()
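
Example #4 (and its twin, Example #44) also calls a get_doc_byUrllib2 helper to fetch the linked resume page. It is not defined on this page; a plausible minimal version, assuming Python 2's urllib2 and a plain GET, would be:

import urllib2

def get_doc_byUrllib2(url, timeout=30):
    # Hypothetical fetch helper: plain GET with a browser-like User-Agent.
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urllib2.urlopen(request, timeout=timeout).read()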
Example #5
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="box_rt01 list", limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    with open(filename, 'wb') as fp:
        content = div.prettify()
        fp.write(content)

    h3s = div.find_all('h3')
    if h3s and len(h3s) != 0:
        title = h3s[0].get_text()
        title = ''.join(title.split())
        print title
        for t in PROFILE_TITLES:
            if t in title:
                employee.title = title
                print "got => " + title
                break
    else:
        print "not found h3"
    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines, employee=employee, force_email=True)
    return parser.parse()
Example #6
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="line20 dataName", limit=1)
    if not divs or len(divs) == 0:
        divs = soup.find_all(name="div", class_="rightArea clearfix ", limit=1)
        if not divs or len(divs) == 0:
            div = soup
        else:
            div = divs[0]
    else:
        div = divs[0]

    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook,max_line=999)
    return parser.parse()
Example #7
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Too messy: only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="td", class_="bd-content", limit=1)
    if not divs or len(divs) == 0:
        divs = soup.find_all(name="td", attrs={"width": "79%"}, limit=1)
        if not divs or len(divs) == 0:
            with open(filename, 'wb') as fp:
                content = doc
                fp.write(content)
            return employee

    div = divs[0]
    with open(filename, 'wb') as fp:
        content = div.prettify()
        fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
Example #8
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="right", limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]
    with open(filename, 'wb') as fp:
        content = div.prettify()
        fp.write(content)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="lf0104", limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,force_email=True)
    return parser.parse()
Example #9
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="td",attrs={"valign":"center"}, limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    lines = []
    tds = div.find_all('td')
    if len(tds) == 0:
        lines = div.stripped_strings
        print "TDS none!"
    else:
        for td in tds:
            string = td.get_text().strip()
            if len(string) < 128:
                string = ''.join(string.split())
                print string
                lines.append(string)

    # Process the content as plain text
    #lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=profile_set_attr_hook,max_line=256)
    return parser.parse()
Example #10
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "main"}, limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           max_line=999,
                           force_email=True,
                           force_tel=False)
    return parser.parse()
Example #11
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="content-wrapper", limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]
    
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    infos_div = div.find_all('div',attrs={"id":"column-1"})
    if infos_div and len(infos_div) != 0:
        div = infos_div[0]

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook,max_line=256)
    return parser.parse()
Example #12
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class":"newsContent"}, limit=1)
    if not divs or len(divs) == 0:
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    
    details = soup.find_all(name="span", attrs={"id":"ctl00_ContentPlaceHolder1_NewsView1_lbl_NewsContent"}, limit=1)
    if not details or len(details) == 0:
        return employee
    # Process the content as plain text
    lines = details[0].stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook)
    return parser.parse()
Example #13
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="box_rt01 list", limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    with open(filename, 'wb') as fp:
        content = div.prettify()
        fp.write(content)

    h3s = div.find_all('h3')
    if h3s and len(h3s) != 0:
        title = h3s[0].get_text()
        title = ''.join(title.split())
        print title
        for t in PROFILE_TITLES:
            if t in title:
                employee.title = title
                print "got => " + title
                break
    else:
        print "not found h3"
    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,force_email=True)
    return parser.parse()
Example #14
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id":"right_2"}, limit=1)
    if not divs or len(divs) == 0:
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    
    researches = [' ',' ']
    tds = div.find_all(name="td",attrs={"bgcolor":"#FFFFFF","class":"ft12","valign":"top"},limit=4)
    if len(tds) == 4:
        researches[0] = tds[2].get_text().strip()
        researches[1] = tds[3].get_text().strip()
        employee.research = researches[0] + ";" + researches[1]
        print "research:" + employee.research 
    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,ignore=set(['research']))
    return parser.parse()
Example #15
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "right-nr"})
    if not divs or len(divs) == 0:
        print("div class=right-nr not found")
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings

    # text=div.get_text(strip=True)
    # ,set_attr_hook=set_attr_hook
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
Example #16
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Too messy: only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="td", class_="bd-content", limit=1)
    if not divs or len(divs) == 0:
        divs = soup.find_all(name="td", attrs={"width": "79%"}, limit=1)
        if not divs or len(divs) == 0:
            with open(filename, "wb") as fp:
                content = doc
                fp.write(content)
            return employee

    div = divs[0]
    with open(filename, "wb") as fp:
        content = div.prettify()
        fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
Example #17
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "NewsArticles"}, limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    if not os.path.exists(filename):
        with open(filename, "wb") as fp:
            content = div.prettify()
            fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(
        lines=lines, employee=employee, set_attr_hook=set_attr_hook, max_line=999, force_email=True, force_tel=False
    )
    return parser.parse()
Example #18
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id":"phy-main"}, limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    
    #email
    #email_div = soup.find_all(name='a',class_="phy-mail")
    #if email_div and len(email_div) != 0:
    #    employee.email = email_div[0].get_text().strip()
    #
    #te_div = soup.find_all(name='a',class_="phy-phone")
    #if te_div and len(te_div) != 0:
    #    employee.tel = te_div[0].get_text().strip()

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook,max_line=256,ignore=set(['title','research']))
    return parser.parse()
Example #19
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="right", limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]
    with open(filename, 'wb') as fp:
        content = div.prettify()
        fp.write(content)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="lf0104", limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines, employee=employee, force_email=True)
    return parser.parse()
Example #20
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    
    #div_header = soup.find_all(name="div", attrs={"class":"neiye-shizi-title"}, limit=1)
    
    divs = soup.find_all(name="div", attrs={"class":"xinwen-txt_3"}, limit=1)
    if not divs or len(divs) == 0:
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    
    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook,ignore=set(['fax']))
    return parser.parse()
Example #21
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="td", attrs={"bgcolor": "#FFFFFF"}, limit=1)
    if not divs or len(divs) == 0:
        with open(filename, 'wb') as fp:
            content = doc
            fp.write(content)
        return employee

    div = divs[0]
    with open(filename, 'wb') as fp:
        content = div.prettify()
        fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=profile_set_attr_hook)
    return parser.parse()
Example #22
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class":"right-nr"})
    if not divs or len(divs) == 0:
        print("div class=right-nr not found")
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    
    # Process the content as plain text
    lines = div.stripped_strings

    # text=div.get_text(strip=True)
    # ,set_attr_hook=set_attr_hook
    parser = ProfileParser(lines=lines,employee=employee)
    return parser.parse()
Example #23
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "darea"}, limit=1)
    if not divs or len(divs) == 0:
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    dnodes = div.find_all(name='div', class_=u"dnode")
    if not dnodes or len(dnodes) == 0:
        return employee

    lines = None
    target_node = None
    done = False
    for node in dnodes:
        lines = node.stripped_strings
        for count, line in enumerate(lines):
            if count >= 2:
                break
            if line == u'联系方式':
                print "binggo!"
                target_node = node
                done = True
                break
        if done:
            break

    if not target_node:
        return employee

    lines = []
    trs = target_node.find_all('tr')
    if trs and len(trs) != 0:
        for tr in trs:
            text = tr.get_text()
            if text:
                text = ''.join(text.split())
                lines.append(text)
    else:
        lines = node.stripped_strings
    #lines = target_node.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
Example #24
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div",
                         attrs={"class": "page_right addpage_right"},
                         limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    tds = div.find_all('td')
    if tds and len(tds) == 11:
        department = tds[2].get_text()
        if department:
            department = ''.join(department.split())
            if department and len(department) != 0:
                employee.departments = department

        title = tds[4].get_text()
        if title:
            title = ''.join(title.split())
            if title and len(title) != 0:
                employee.title = title

        email = tds[8].get_text()
        if email:
            email = ''.join(email.split())
            if email and len(email) != 0:
                employee.email = email

        research = tds[10].get_text()
        if research:
            research = ''.join(research.split())
            if research and len(research) != 0:
                employee.research = research

    divs = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if divs and len(divs) != 0:
        div = divs[0]
    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
Example #25
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class":"darea"}, limit=1)
    if not divs or len(divs) == 0:
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    
    dnodes = div.find_all(name='div',class_=u"dnode")
    if not dnodes or len(dnodes) == 0:
        return employee
    
    lines = None
    target_node = None
    done = False
    for node in dnodes:
        lines = node.stripped_strings
        for count,line in enumerate(lines):
            if count >= 2:
                break
            if line == u'联系方式':
                print "binggo!"
                target_node = node
                done = True
                break
        if done:
            break
    
    if not target_node:
        return employee
    
    lines = []
    trs = target_node.find_all('tr')
    if trs and len(trs) != 0:
        for tr in trs:
            text = tr.get_text()
            if text:
                text = ''.join(text.split())
                lines.append(text)
    else:
        lines = node.stripped_strings
    #lines = target_node.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook)
    return parser.parse()
Example #26
def handler(tag):
    employee = Employee()
    name_divs = tag.find_all("div", class_="teacher-title")
    if name_divs and len(name_divs) != 0:
        employee.name = name_divs[0].get_text()
        employee.name = ''.join(employee.name.split())

    # Process the content as plain text
    lines = tag.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
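
Unlike the profile_handler functions, the handler(tag) variants in this and several of the following examples receive an already-located element from a staff listing page. The driver is not shown anywhere on this page; a hypothetical one (the div class name here is an assumption) could look like:

def list_handler(doc):
    # Hypothetical driver: split a listing page into per-teacher blocks
    # and run handler() on each one.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    return [handler(tag) for tag in soup.find_all("div", class_="teacher-item")]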
Example #27
def handler(tag):
    employee = Employee()
    name_divs = tag.find_all("div",class_="teacher-title")
    if name_divs and len(name_divs) != 0:
        employee.name = name_divs[0].get_text()
        employee.name = ''.join(employee.name.split())
    
    # Process the content as plain text
    lines = tag.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee)
    return parser.parse()
Example #28
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class":"page_right addpage_right"}, limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    tds = div.find_all('td')
    if tds and len(tds) == 11:
        department = tds[2].get_text()
        if department:
            department = ''.join(department.split())
            if department and len(department) != 0:
                employee.departments = department

        title = tds[4].get_text()
        if title:
            title = ''.join(title.split())
            if title and len(title) != 0:
                employee.title = title

        email = tds[8].get_text()
        if email:
            email = ''.join(email.split())
            if email and len(email) != 0:
                employee.email = email

        research = tds[10].get_text()
        if research:
            research = ''.join(research.split())
            if research and len(research) != 0:
                employee.research = research

    divs = soup.find_all(name="div", attrs={"class":"text_more"}, limit=1)
    if divs and len(divs) != 0:
        div = divs[0]
    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook)
    return parser.parse()
Example #29
def handler(tag):
    employee = Employee()
    ass = tag.find_all('a',class_="orangea")
    if ass and len(ass) != 0:
        employee.name = ass[0].get_text()
        employee.name = ''.join(employee.name.split())
        employee.profile = ass[0]['href']
    
    ass = tag.find_all('a',class_="black01")
    if ass and len(ass) != 0:
        lines = ass[0].stripped_strings
        parser = ProfileParser(lines=lines,employee=employee)
        employee = parser.parse()
    return employee
Example #30
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id":"maincontent"}, limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
    
    divs = div.find_all(class_="other")
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    lines = []
    spans = div.find_all('span')
    for child in spans:
        line = child.get_text()
        if line:
            line = ''.join(line.split())
            if line:
                lines.append(line)
    if len(lines) == 0:
        return employee
    #email
    #email_div = soup.find_all(name='a',class_="phy-mail")
    #if email_div and len(email_div) != 0:
    #    employee.email = email_div[0].get_text().strip()
    #
    #te_div = soup.find_all(name='a',class_="phy-phone")
    #if te_div and len(te_div) != 0:
    #    employee.tel = te_div[0].get_text().strip()

    # Process the content as plain text
    #lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook)
    return parser.parse()
Example #31
def handler(tag):
    employee = Employee()
    ass = tag.find_all('a', class_="orangea")
    if ass and len(ass) != 0:
        employee.name = ass[0].get_text()
        employee.name = ''.join(employee.name.split())
        employee.profile = ass[0]['href']

    ass = tag.find_all('a', class_="black01")
    if ass and len(ass) != 0:
        lines = ass[0].stripped_strings
        parser = ProfileParser(lines=lines, employee=employee)
        employee = parser.parse()
    return employee
Example #32
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="table",
                         attrs={
                             "width": "96%",
                             "cellspacing": "0"
                         },
                         limit=1)
    if not divs or len(divs) == 0:
        print "not found main div"
        div = soup
    else:
        div = divs[0]

    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    divs = soup.find_all(name="table",
                         attrs={
                             "width": "96%",
                             "cellspacing": "1"
                         },
                         limit=1)
    if not divs or len(divs) == 0:
        print "not found main div"
        div = soup
    else:
        div = divs[0]

    ass = div.find_all('a', text="点击此处访问")
    if ass and len(ass) != 0:
        employee.profile = ass[0]['href']
        print 'Got profile:' + employee.profile

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           max_line=256)
    return parser.parse()
Example #33
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    email_image_filename = os.path.join(path, name + "_email.png")
    tel_image_filename = os.path.join(path, name + "_tel.png")

    employee = Employee(name=name, url=url)
    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "view_pannel"}, limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    # email image
    item_divs = div.find_all(name="div", attrs={"class": "item_list"})

    ignores = []
    # Use a separate loop variable so the outer `div` (read via
    # stripped_strings below) is not clobbered by the last item_list entry.
    for item in item_divs:
        string = item.get_text()
        if string and len(string) != 0:
            if u'邮件' in string and len(employee.email) == 0:
                employee.email = image2text(imageSrc(item),
                                            email_image_filename, 'eng2')
                print(employee.email)
                ignores.append('email')
            elif u'电话' in string and len(employee.tel) == 0:
                employee.tel = image2text(imageSrc(item), tel_image_filename,
                                          'eng')
                print(employee.tel)
                ignores.append('tel')

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           max_line=256,
                           ignore=set(ignores))
    return parser.parse()
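
image2text and imageSrc in this example (and in Example #36) are also external helpers: the page renders the email address and phone number as images, so the values have to be OCRed. A hedged sketch, assuming pytesseract as the OCR backend ('eng2' would then be a custom traineddata name; the original may use a different engine):

import urllib2
from PIL import Image
import pytesseract

def imageSrc(tag):
    # Hypothetical: src of the first <img> inside the tag, if any.
    img = tag.find('img')
    return img['src'] if img else None

def image2text(src, filename, lang):
    # Hypothetical: download the image, save it beside the profile, and OCR it.
    if not src:
        return ''
    with open(filename, 'wb') as fp:
        fp.write(urllib2.urlopen(src).read())
    return pytesseract.image_to_string(Image.open(filename), lang=lang).strip()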
Example #34
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    div = soup
    with open(filename, 'wb') as fp:
        content = div.prettify()
        fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
Example #35
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    div = soup
    with open(filename, 'wb') as fp:
        content = div.prettify()
        fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee)
    return parser.parse()
Example #36
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    email_image_filename = os.path.join(path, name + "_email.png")
    tel_image_filename = os.path.join(path, name + "_tel.png")

    employee = Employee(name=name, url=url)
    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "view_pannel"}, limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]

    if not os.path.exists(filename):
        with open(filename, "wb") as fp:
            content = div.prettify()
            fp.write(content)

    # email image
    item_divs = div.find_all(name="div", attrs={"class": "item_list"})

    ignores = []
    # Use a separate loop variable so the outer `div` (read via
    # stripped_strings below) is not clobbered by the last item_list entry.
    for item in item_divs:
        string = item.get_text()
        if string and len(string) != 0:
            if u"邮件" in string and len(employee.email) == 0:
                employee.email = image2text(imageSrc(item), email_image_filename, "eng2")
                print(employee.email)
                ignores.append("email")
            elif u"电话" in string and len(employee.tel) == 0:
                employee.tel = image2text(imageSrc(item), tel_image_filename, "eng")
                print(employee.tel)
                ignores.append("tel")

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(
        lines=lines, employee=employee, set_attr_hook=set_attr_hook, max_line=256, ignore=set(ignores)
    )
    return parser.parse()
Example #37
def handler(tag):
    employee = Employee()

    lines = tag.stripped_strings

    ass = tag.find_all(name="a", attrs={"class": "dt_text_tit"})
    if not ass or len(ass) == 0:
        # first line is the name
        for count, line in enumerate(lines):
            employee.name = line
            break
    else:
        employee.name = ass[0].string
        employee.profile = ass[0]['href']
        employee.url = employee.profile

    parser = ProfileParser(lines=lines, employee=employee)
    employee = parser.parse()
    return employee
Example #38
def handler(tag):
    employee = Employee()

    lines = tag.stripped_strings

    ass = tag.find_all(name="a", attrs={"class": "dt_text_tit"})
    if not ass or len(ass) == 0:
        # first line is the name
        for count, line in enumerate(lines):
            employee.name = line
            break
    else:
        employee.name = ass[0].string
        employee.profile = ass[0]["href"]
        employee.url = employee.profile

    parser = ProfileParser(lines=lines, employee=employee)
    employee = parser.parse()
    return employee
Example #39
def handler(tag):
    
    name_spans = tag.find_all(class_="handle")
    if not name_spans or len(name_spans) == 0:
        return None
    
    # js <span class="handle" onclick="toCardDetailAction('10c07e70-3fb6-42af-aa26-bfab26b6ce0406');" style="color:#2084D2;font-size: 16px;">艾明晶</span>
    
    employee = Employee()
    employee.name = name_spans[0].get_text()
    employee.name = ''.join(employee.name.split())
    
    card_id = name_spans[0]['onclick'][len('toCardDetailAction(\''):-3]
    employee.url = 'http://scse.buaa.edu.cn/buaa-css-web/toCardDetailAction.action?firstSelId=CARD_TMPL_OF_FIRST_NAVI_CN%20&%20secondSelId=CARD_TMPL_OF_ALL_TEACHER_CN%20&cardId='+card_id
    print ("card_id=[%s]"%card_id)

    
    lines = tag.stripped_strings
    parser = ProfileParser(lines=lines,employee=employee)
    return parser.parse()
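
The card_id slice above strips the leading "toCardDetailAction('" and the trailing "');" from the onclick attribute. Checked against the sample value in the comment:

onclick = "toCardDetailAction('10c07e70-3fb6-42af-aa26-bfab26b6ce0406');"
card_id = onclick[len("toCardDetailAction('"):-3]  # drop the prefix and the trailing ');
print(card_id)  # 10c07e70-3fb6-42af-aa26-bfab26b6ce0406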
Example #40
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="xq_teacher", limit=1)
    if not divs or len(divs) == 0:
        return employee

    div = divs[0]

    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    # Find the research interests
    details = div.find_all("div", class_="con01_t", limit=3)
    if details and len(details) >= 2:
        employee.research = details[1].get_text()
        employee.research = ''.join(employee.research.split())
        # Filter out strings that are too short
        if len(employee.research) <= (len(u'研究方向') + 1):
            employee.research = ''
        else:
            employee.research = employee.research.replace(u'，', u',')

    # Parse the remaining personal info
    infos = div.find_all("div", class_="wz_teacher", limit=1)
    if infos and len(infos) != 0:
        # Process the content as plain text
        lines = infos[0].stripped_strings
        parser = ProfileParser(lines=lines,
                               employee=employee,
                               set_attr_hook=set_attr_hook,
                               force_email=True)
        return parser.parse()
    else:
        return employee
Example #41
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="td",attrs={"bgcolor":"#FFFFFF"}, limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=profile_set_attr_hook)
    return parser.parse()
Example #42
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id":"work"}, limit=1)
    if not divs or len(divs) == 0:
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook,force_email=True)
    return parser.parse()
Example #43
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="xq_teacher", limit=1)
    if not divs or len(divs) == 0:
        return employee

    div = divs[0]
    
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)
        
    # Find the research interests
    details = div.find_all("div",class_="con01_t",limit=3)
    if details and len(details) >= 2:
        employee.research = details[1].get_text()
        employee.research = ''.join(employee.research.split())
        # Filter out strings that are too short
        if len(employee.research) <= (len(u'研究方向')+1):
            employee.research = ''
        else:
            employee.research = employee.research.replace(u'，', u',')
    
    # Parse the remaining personal info
    infos = div.find_all("div",class_="wz_teacher",limit=1)
    if infos and len(infos) != 0:
        # Process the content as plain text
        lines = infos[0].stripped_strings
        parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook,force_email=True)
        return parser.parse()
    else:
        return employee
Example #44
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)

    lis = soup.find_all(name="li")
    if not lis or len(lis) != 5:
        div = soup
    else:
        ass = lis[4].find_all('a')
        if len(ass) != 0:
            li_url = ass[0]['href']
            newUrl = urljoin(url, li_url)
            newDoc = get_doc_byUrllib2(newUrl)
            soup = BeautifulSoup(newDoc, Config.SOUP_PARSER)
            mainDiv = soup.find_all('div', attrs={"id": "main"})

            if not mainDiv or len(mainDiv) == 0:
                print "not found main div"
                div = soup
            else:
                div = mainDiv[0]
        else:
            div = soup

    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    # Process the content as plain text
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           force_email=True)
    return parser.parse()
Example #45
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "right_2"}, limit=1)
    if not divs or len(divs) == 0:
        return employee

    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    researches = [' ', ' ']
    tds = div.find_all(name="td",
                       attrs={
                           "bgcolor": "#FFFFFF",
                           "class": "ft12",
                           "valign": "top"
                       },
                       limit=4)
    if len(tds) == 4:
        researches[0] = tds[2].get_text().strip()
        researches[1] = tds[3].get_text().strip()
        employee.research = researches[0] + ";" + researches[1]
        print "research:" + employee.research
    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           ignore=set(['research']))
    return parser.parse()
Example #46
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="td", attrs={"valign": "center"}, limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    lines = []
    tds = div.find_all('td')
    if len(tds) == 0:
        lines = div.stripped_strings
        print "TDS none!"
    else:
        for td in tds:
            string = td.get_text().strip()
            if len(string) < 128:
                string = ''.join(string.split())
                print string
                lines.append(string)

    # Process the content as plain text
    #lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=profile_set_attr_hook,
                           max_line=256)
    return parser.parse()
Example #47
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Too messy: only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    tables = soup.find_all(name="table",limit=4)
    if len(tables) < 4:
        return employee

    table_content = tables[3]
    with open(filename, 'wb') as fp:
        content = table_content.prettify()
        fp.write(content)

    td = table_content.find_all("td", attrs={"valign": "top", "width": "577"})
    if not td or len(td) == 0:
        return employee

    # Extract the personal info
    lines = td[0].stripped_strings
    parser = ProfileParser(lines=lines,employee=employee)
    return parser.parse()
Example #48
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="table", attrs={"width":"96%","cellspacing":"0"}, limit=1)    
    if not divs or len(divs) == 0:
        print "not found main div"
        div = soup
    else:
        div = divs[0]
    
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            content = div.prettify()
            fp.write(content)

    divs = soup.find_all(name="table", attrs={"width":"96%","cellspacing":"1"}, limit=1)
    if not divs or len(divs) == 0:
        print "not found main div"
        div = soup
    else:
        div = divs[0]

    ass = div.find_all('a',text="点击此处访问")
    if ass and len(ass) != 0:
        employee.profile = ass[0]['href']
        print 'Got profile:' + employee.profile

    # Process the content as plain text
    lines = div.stripped_strings
    # text=div.get_text(strip=True)
    parser = ProfileParser(lines=lines,employee=employee,set_attr_hook=set_attr_hook,max_line=256)
    return parser.parse()
Example #49
def profile_handler(doc, name, url, path):
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # Too messy: only save the name and personal homepage; the resume file is saved separately to the current directory
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    tables = soup.find_all(name="table", limit=4)
    if len(tables) < 4:
        return employee

    table_content = tables[3]
    with open(filename, 'wb') as fp:
        content = table_content.prettify()
        fp.write(content)

    td = table_content.find_all("td", attrs={"valign": "top", "width": "577"})
    if not td or len(td) == 0:
        return employee

    # Extract the personal info
    lines = td[0].stripped_strings
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()