Exemplo n.º 1
0
def get_detail(html):
    """Parse the profile page of a verified-enterprise account.

    This parser is rarely needed in practice.

    :param html: page source; it is passed to ``public.get_right`` and the
        returned markup is what actually gets parsed.
    :return: dict that may contain the keys ``description`` and
        ``friend_links``; a key is omitted when its markup is not found.
    """
    details = {}
    cont = public.get_right(html)
    soup = BeautifulSoup(cont, 'html.parser')
    basic_modules = soup.find_all(attrs={'class': 'WB_cardwrap S_bg2'})
    basic_info = soup.find_all(attrs={'class': 'li_1 clearfix'})
    for each_module in basic_modules:
        # Extract the module text once; both section checks use it.
        module_text = each_module.get_text()
        if '简介' in module_text:
            text_node = each_module.find(attrs={'node-type': 'text'})
            # find() yields None when nothing matches; skip the field rather
            # than let an AttributeError abort the whole parse.
            if text_node is not None:
                details['description'] = text_node.get_text().replace('\n', '')
        if '基本讯息' in module_text:
            for each in basic_info:
                if '友情链接' in each.get_text():
                    link_node = each.find(attrs={'class': 'pt_detail'})
                    if link_node is not None:
                        details['friend_links'] = link_node.get_text()
    return details
Exemplo n.º 2
0
def get_detail(html):
    """Parse the profile page of a verified-enterprise account.

    This parser is rarely needed in practice.

    :param html: page source; it is passed to ``public.get_right`` and the
        returned markup is what actually gets parsed.
    :return: dict that may contain the keys ``description`` and
        ``friend_links``; a key is omitted when its markup is not found.
    """
    details = {}
    cont = public.get_right(html)
    soup = BeautifulSoup(cont, 'html.parser')
    basic_modules = soup.find_all(attrs={'class': 'WB_cardwrap S_bg2'})
    basic_info = soup.find_all(attrs={'class': 'li_1 clearfix'})
    for each_module in basic_modules:
        module_text = each_module.get_text()

        if '简介' in module_text:
            description_node = each_module.find(attrs={'node-type': 'text'})
            # A module can mention the keyword without containing the text
            # node; find() then returns None, so guard before dereferencing.
            if description_node is not None:
                description = description_node.get_text().replace('\n', '')
                details['description'] = description

        if '基本讯息' in module_text:
            for each in basic_info:
                if '友情链接' not in each.get_text():
                    continue
                detail_node = each.find(attrs={'class': 'pt_detail'})
                if detail_node is not None:
                    details['friend_links'] = detail_node.get_text()
    return details
Exemplo n.º 3
0
def get_detail(html):
    """Parse a personal profile page into a ``User`` object.

    The markup returned by ``public.get_right(html)`` is searched for card
    modules (class ``WB_cardwrap S_bg2``). The title of each module decides
    which ``User`` attributes are filled from ``li_1 clearfix`` rows.

    A failure while parsing one module is printed and that module is
    abandoned; the remaining modules are still processed.

    :param html: page source handed to ``public.get_right``.
    :return: the populated ``User``, or ``None`` when ``public.get_right``
        returns an empty string.
    """
    user = User()
    cont = public.get_right(html)
    if cont == '':
        return None
    soup = BeautifulSoup(cont, 'html.parser')
    basic_modules = soup.find_all(attrs={'class': 'WB_cardwrap S_bg2'})
    # The basic-info section reads rows collected from the whole page. This
    # list has its own name so the per-module row lists used by the other
    # sections can never overwrite it, whatever order the modules come in.
    page_rows = soup.find_all(attrs={'class': 'li_1 clearfix'})
    for each_module in basic_modules:
        try:
            basic_str = each_module.find(attrs={'class': 'main_title W_fb W_f14'}).get_text()
            if '基本信息' in basic_str:
                for each in page_rows:
                    each_str = each.get_text()
                    if '昵称:' in each_str:
                        user.name = each.find(attrs={'class': 'pt_detail'}).get_text()
                    elif '所在地:' in each_str:
                        user.location = each.find(attrs={'class': 'pt_detail'}).get_text()
                    elif '性别:' in each_str:
                        gender = each.find(attrs={'class': 'pt_detail'}).get_text()
                        # 1 = male ('男'), 2 = female ('女'), 0 = anything else
                        if gender == '男':
                            user.gender = 1
                        elif gender == '女':
                            user.gender = 2
                        else:
                            user.gender = 0
                    elif '生日:' in each_str:
                        user.birthday = each.find(attrs={'class': 'pt_detail'}).get_text()
                    elif '简介:' in each_str:
                        description = each.find(attrs={'class': 'pt_detail'}).get_text()
                        # Round-trip through GBK to drop characters GBK cannot encode.
                        user.description = description.encode('gbk', 'ignore').decode('gbk')
                    elif '注册时间:' in each_str:
                        user.register_time = each.find(attrs={'class': 'pt_detail'}).get_text().replace(
                            '\t', '').replace('\r\n', '')

            if '标签信息' in basic_str:
                module_rows = each_module.find_all(attrs={'class': 'li_1 clearfix'})
                for each in module_rows:
                    if '标签:' in each.get_text():
                        user.tags = each.find(attrs={'class': 'pt_detail'}).get_text().replace('\t', '').replace(
                            '\n\n\n', '').strip().replace('\r\n', ';')

            if '教育信息' in basic_str:
                module_rows = each_module.find_all(attrs={'class': 'li_1 clearfix'})
                for each in module_rows:
                    if '大学:' in each.get_text():
                        user.education_info = each.find(attrs={'class': 'pt_detail'}).get_text().replace(
                            '\r\n', ',').replace('\t', '').replace('\n', ';').strip(';')

            if '工作信息' in basic_str:
                module_rows = each_module.find_all(attrs={'class': 'li_1 clearfix'})
                jobs_info = []
                for each in module_rows:
                    if '公司:' in each.get_text():
                        jobs = each.find_all(attrs={'class': 'pt_detail'})
                        for job in jobs:
                            jobs_info.append(job.get_text().replace('\r\n', '').replace('\t', '').replace('\n', ''))
                user.work_info = ';'.join(jobs_info)

            if '联系信息' in basic_str:
                module_rows = each_module.find_all(attrs={'class': 'li_1 clearfix'})
                contact_info = []
                for each in module_rows:
                    row_text = each.get_text()
                    if 'QQ:' in row_text:
                        contact_info.append(
                            'qq:' + each.find(attrs={'class': 'pt_detail'}).get_text().replace('\n', ''))
                    if '邮箱:' in row_text:
                        contact_info.append('email:' + each.find(attrs={'class': 'pt_detail'}).get_text())
                    if 'MSN:' in row_text:
                        contact_info.append('msn:' + each.find(attrs={'class': 'pt_detail'}).get_text())
                user.contact_info = ';'.join(contact_info)
        except Exception as why:
            # Best-effort parsing: report the problem and move on to the next module.
            print('解析出错,具体原因为{why}'.format(why=why))

    return user
Exemplo n.º 4
0
def get_detail(html):
    """Parse a personal profile page into a ``User`` object.

    The markup returned by ``public.get_right(html)`` is searched for card
    modules (class ``WB_cardwrap S_bg2``). The title of each module decides
    which ``User`` attributes are filled from ``li_1 clearfix`` rows.

    A failure while parsing one module is printed and that module is
    abandoned; the remaining modules are still processed.

    :param html: page source handed to ``public.get_right``.
    :return: the populated ``User``, or ``None`` when ``public.get_right``
        returns an empty string.
    """
    row_attrs = {'class': 'li_1 clearfix'}

    def detail_text(row):
        # Text of the row's first 'pt_detail' node. Raises AttributeError when
        # the node is missing, which the per-module handler below reports.
        return row.find(attrs={'class': 'pt_detail'}).get_text()

    user = User()
    cont = public.get_right(html)
    if cont == '':
        return None
    soup = BeautifulSoup(cont, 'html.parser')
    basic_modules = soup.find_all(attrs={'class': 'WB_cardwrap S_bg2'})
    # Page-wide rows feed the basic-info section only. The other sections use
    # rows scoped to their own module under a different name, so this list is
    # never overwritten regardless of module order.
    all_rows = soup.find_all(attrs=row_attrs)
    for each_module in basic_modules:
        try:
            basic_str = each_module.find(attrs={'class': 'main_title W_fb W_f14'}).get_text()
            if '基本信息' in basic_str:
                for each in all_rows:
                    each_str = each.get_text()
                    if '昵称:' in each_str:
                        user.name = detail_text(each)
                    elif '所在地:' in each_str:
                        user.location = detail_text(each)
                    elif '性别:' in each_str:
                        gender = detail_text(each)
                        # 1 = male ('男'), 2 = female ('女'), 0 = anything else
                        if gender == '男':
                            user.gender = 1
                        elif gender == '女':
                            user.gender = 2
                        else:
                            user.gender = 0
                    elif '生日:' in each_str:
                        user.birthday = detail_text(each)
                    elif '简介:' in each_str:
                        # Round-trip through GBK to drop characters GBK cannot encode.
                        user.description = detail_text(each).encode('gbk', 'ignore').decode('gbk')
                    elif '注册时间:' in each_str:
                        user.register_time = detail_text(each).replace('\t', '').replace('\r\n', '')

            if '标签信息' in basic_str:
                for each in each_module.find_all(attrs=row_attrs):
                    if '标签:' in each.get_text():
                        user.tags = detail_text(each).replace('\t', '').replace(
                            '\n\n\n', '').strip().replace('\r\n', ';')

            if '教育信息' in basic_str:
                for each in each_module.find_all(attrs=row_attrs):
                    if '大学:' in each.get_text():
                        user.education_info = detail_text(each).replace('\r\n', ',').replace(
                            '\t', '').replace('\n', ';').strip(';')

            if '工作信息' in basic_str:
                jobs_info = []
                for each in each_module.find_all(attrs=row_attrs):
                    if '公司:' in each.get_text():
                        # A company row may hold several 'pt_detail' nodes, one per job.
                        for job in each.find_all(attrs={'class': 'pt_detail'}):
                            jobs_info.append(job.get_text().replace('\r\n', '').replace('\t', '').replace('\n', ''))
                user.work_info = ';'.join(jobs_info)

            if '联系信息' in basic_str:
                contact_info = []
                for each in each_module.find_all(attrs=row_attrs):
                    row_text = each.get_text()
                    if 'QQ:' in row_text:
                        contact_info.append('qq:' + detail_text(each).replace('\n', ''))
                    if '邮箱:' in row_text:
                        contact_info.append('email:' + detail_text(each))
                    if 'MSN:' in row_text:
                        contact_info.append('msn:' + detail_text(each))
                user.contact_info = ';'.join(contact_info)
        except Exception as why:
            # Best-effort parsing: report the problem and move on to the next module.
            print('解析出错,具体原因为{why}'.format(why=why))

    return user