Exemplo n.º 1
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            print(e)
            print('休息10分钟!')
            time.sleep(60 * 10)


#         self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        new_style = False

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all(
                            'div', attrs={'class': 'WB_cardwrap'}):
                        block_title_div = block_div.find(
                            'h4', attrs={'class': 'obj_name'})
                        if block_title_div is None:
                            block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\
                                .find('h2')
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find(
                            'div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
                    weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    weibo_user.info.n_follows = int(tds[0].find('strong').text)
                    weibo_user.info.n_fans = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('p', attrs='photo_wrap')\
                                                .find('img')['src']
            elif 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={
                        'class': 'pt_title'
                    }).text.strip().strip(u':')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = WorkInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if '(' in text:
                                work_info.date = text.strip().split(
                                    '(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区:'):
                                work_info.location = text.split(u':', 1)[1]
                            elif text.startswith(u'职位:'):
                                work_info.position = text.split(u':', 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = WorkInfo()

                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info.name = a.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区:'):
                            work_info.location = l.split(u':', 1)[1]
                        elif l.startswith(u'职位:'):
                            work_info.position = l.split(u':', 1)[1]
                        else:
                            work_info.detail = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()

                    weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = EduInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if '(' in text:
                                edu_info.date = text.strip().split(
                                    '(')[1].strip().strip(')')
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)

                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx + 1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos:end_pos]

                    edu_info = EduInfo()
                    edu_info.name = name
                    if '(' in text:
                        edu_info.date = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')') + 1:]
                    text = text[end_pos:]
                    edu_info.detail = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find('span', attrs={
                        'class': 'pt_detail'
                }).find_all('a'):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()
        #         self.logger.debug('parse %s finish' % url)

        # counter add one for the profile url
        self.counter.inc('processed_profile_page', 1)
Exemplo n.º 2
0
    def parse(self, url=None):
        if self.bundle.exists == False:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
        soup = BeautifulSoup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = BeautifulSoup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = BeautifulSoup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = BeautifulSoup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = BeautifulSoup(data['html'])

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        for div in profile_div.find_all(attrs={'class': 'pf_item'}):
            k = div.find(attrs={'class': 'label'}).text.strip()
            v = div.find(attrs={'class': 'con'}).text.strip()
            if k in profile_map:
                func = (lambda s: s) \
                        if 'func' not in profile_map[k] \
                        else profile_map[k]['func']
                v = func(v)
                setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        for div in career_div.find_all(attrs={'class': 'con'}):
            work_info = WorkInfo()
            ps = div.find_all('p')
            for p in ps:
                a = p.find('a')
                if a is not None:
                    work_info.name = a.text
                    text = p.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1].strip(')')
                else:
                    text = p.text
                    if text.startswith(u'地区:'):
                        work_info.location = text.split(':', 1)[1]
                    elif text.startswith(u'职位:'):
                        work_info.position = text.split(':', 1)[1]
                    else:
                        work_info.detail = text
            weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        for div in edu_div.find_all(attrs={'class': 'con'}):
            edu_info = EduInfo()
            ps = div.find_all('p')
            for p in ps:
                a = p.find('a')
                text = p.text
                if a is not None:
                    edu_info.name = a.text
                    if '(' in text:
                        edu_info.date = text.strip().split('(')[1].strip(')')
                else:
                    edu_info.detail = text
            weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        for div in tags_div.find_all(attrs={'class': 'con'}):
            for a in div.find_all('a'):
                weibo_user.info.tags.append(a.text)

        weibo_user.save()
        return [], []
Exemplo n.º 3
0
    def parse(self, url=None):
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid == 'Pl_Official_LeftInfo__13':
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
            elif 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        if profile_div is not None:
            for div in profile_div.find_all(attrs={'class': 'pf_item'}):
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if '(' in text:
                            work_info.date = text.strip().split('(')[1].strip(
                                ')')
                    else:
                        text = p.text
                        if text.startswith(u'地区:'):
                            work_info.location = text.split(u':', 1)[1]
                        elif text.startswith(u'职位:'):
                            work_info.position = text.split(u':', 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split('(')[1].strip(
                                ')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)

        weibo_user.save()
        self.logger.debug('parse %s finish' % url)
        return [], []
Exemplo n.º 4
0
    def parse(self, url=None):

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
            html = br.response().read()

            if not self.check(url, br):
                return

            self.uid = re.findall("CONFIG\['oid'\]='(.*)';", html)[0]
        except:
            raise FetchBannedError("get banned on blog page")

        weibo_user = self.get_weibo_user(self.uid)
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        soup = beautiful_soup(html)
        new_style = False

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all(
                            'div', attrs={'class': 'WB_cardwrap'}):
                        block_title_div = block_div.find(
                            'h4', attrs={'class': 'obj_name'})
                        if block_title_div is None:
                            block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\
                                .find('h2')
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find(
                            'div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1' and data.has_key('html'):
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']

                    weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'
                                      ) and data.has_key('html'):
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    weibo_user.info.n_follows = int(tds[0].find('strong').text)
                    weibo_user.info.n_fans = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('p', attrs='photo_wrap')\
                                                .find('img')['src']
                    bs_verified = header_soup.find(
                        'a',
                        attrs={
                            "suda-data": "key=pc_apply_entry&value=feed_icon"
                        })
                    weibo_user.info.verified = True if bs_verified else False
                    bs_vip = header_soup.find(
                        'a',
                        attrs={
                            "suda-uatrack": "key=home_vip&value=home_feed_vip"
                        })
                    weibo_user.info.vip = True if bs_vip else False
                    weibo_user.info.pf_intro = header_soup.find('div',
                                                                attrs={
                                                                    'class':
                                                                    'pf_intro'
                                                                }).text
                elif domid.startswith('Pl_Official_RightGrowNew'):
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.level_score = int(
                        header_soup.find('p', attrs={
                            'class': 'level_info'
                        }).find_all('span', attrs={'class':
                                                   'S_txt1'})[1].text.strip())
                    weibo_user.info.level = int(
                        header_soup.find('p', attrs={
                            'class': 'level_info'
                        }).find_all('span',
                                    attrs={'class': 'S_txt1'
                                           })[0].text.strip().split('.')[1])

            elif 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'gender'
            },
            u'生日': {
                'field':
                'birth',
                'func':
                lambda v: datetime.strptime(
                    v.replace(u'年', '/').replace(u'月', '/').replace(u'日', ''),
                    '%Y/%m/%d') if re.match(u'\d+年\d+月\d+日', v) else None
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            },
            u'注册时间': {
                'field': 'register_date'
            }
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={
                        'class': 'pt_title'
                    }).text.strip().strip(u':')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()

                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    try:
                        v = func(v)
                    except:
                        v = None
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = WorkInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if '(' in text:
                                work_info.date = text.strip().split(
                                    '(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区:'):
                                work_info.location = text.split(u':', 1)[1]
                            elif text.startswith(u'职位:'):
                                work_info.position = text.split(u':', 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = WorkInfo()

                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info.name = a.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区:'):
                            work_info.location = l.split(u':', 1)[1]
                        elif l.startswith(u'职位:'):
                            work_info.position = l.split(u':', 1)[1]
                        else:
                            work_info.detail = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()

                    weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = EduInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if '(' in text:
                                edu_info.date = text.strip().split(
                                    '(')[1].strip().strip(')')
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)

                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx + 1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos:end_pos]

                    edu_info = EduInfo()
                    edu_info.name = name
                    if '(' in text:
                        edu_info.date = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')') + 1:]
                    text = text[end_pos:]
                    edu_info.detail = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find('span', attrs={
                        'class': 'pt_detail'
                }).find_all('a'):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()

        # counter add one for the profile url
        self.counter.inc('processed_profile_page', 1)