Exemplo n.º 1
0
    def parse(self, url=None):
        """Parse a weibo user's profile page and save the extracted info.

        The profile data is embedded in the page as JSON arguments of
        inline ``FM.view(...)`` / ``STK.pageletM.view(...)`` script calls.
        This method decodes those payloads and fills ``weibo_user.info``
        with the basic profile fields, work history, education history,
        avatar and tags, then saves the user.

        :param url: page url to fetch; defaults to ``self.url``.
        :return: two empty lists (this parser yields no follow-up urls
                 or bundles).
        """
        # PEP 8: use an identity check, not `== False`, for a boolean flag.
        if self.bundle.exists is False:
            return [], []

        url = url or self.url
        br = self.opener.browse_open(url)
        self.logger.debug("load %s finish" % url)
        soup = beautiful_soup(br.response().read())

        # Stop early when the fetched page fails validation.
        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        # Locate the HTML fragments that hold each info section.
        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all("script"):
            text = script.text
            if "FM.view" in text:
                # Strip the `FM.view(` wrapper and trailing `)` to get JSON.
                text = text.replace("FM.view(", "")[:-1]
                data = json.loads(text)
                domid = data["domid"]
                if domid == "Pl_Official_LeftInfo__13":
                    info_soup = beautiful_soup(data["html"])
                    info_div = info_soup.find("div", attrs={"class": "profile_pinfo"})
                    for block_div in info_div.find_all("div", attrs={"class": "infoblock"}):
                        block_title = block_div.find("form").text.strip()
                        if block_title == u"基本信息":
                            profile_div = block_div
                        elif block_title == u"工作信息":
                            career_div = block_div
                        elif block_title == u"教育信息":
                            edu_div = block_div
                        elif block_title == u"标签信息":
                            tags_div = block_div
                elif domid == "Pl_Official_Header__1":
                    header_soup = beautiful_soup(data["html"])
                    weibo_user.info.avatar = header_soup.find("div", attrs={"class": "pf_head_pic"}).find("img")["src"]
            elif "STK" in text:
                # Older page style: same idea behind an STK pagelet wrapper.
                text = text.replace("STK && STK.pageletM && STK.pageletM.view(", "")[:-1]
                data = json.loads(text)
                pid = data["pid"]
                if pid == "pl_profile_infoBase":
                    profile_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoCareer":
                    career_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoEdu":
                    edu_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoTag":
                    tags_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_photo":
                    soup = beautiful_soup(data["html"])
                    weibo_user.info.avatar = soup.find("img")["src"]

        # Maps the Chinese field label shown on the page to a UserInfo
        # attribute name, with an optional value-converter function.
        profile_map = {
            u"昵称": {"field": "nickname"},
            u"所在地": {"field": "location"},
            u"性别": {"field": "sex", "func": lambda s: True if s == u"男" else False},
            u"生日": {"field": "birth"},
            u"博客": {"field": "blog"},
            u"个性域名": {"field": "site"},
            u"简介": {"field": "intro"},
            u"邮箱": {"field": "email"},
            u"QQ": {"field": "qq"},
            u"MSN": {"field": "msn"},
        }
        if profile_div is not None:
            for div in profile_div.find_all(attrs={"class": "pf_item"}):
                k = div.find(attrs={"class": "label"}).text.strip()
                v = div.find(attrs={"class": "con"}).text.strip()
                if k in profile_map:
                    # The custom-domain field may render as "text | value".
                    if k == u"个性域名" and "|" in v:
                        v = v.split("|")[1].strip()
                    func = (lambda s: s) if "func" not in profile_map[k] else profile_map[k]["func"]
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]["field"], v)

        # Work history: one WorkInfo per "con" block; the <a> carries the
        # company name, plain <p> lines carry location/position/details.
        weibo_user.info.work = []
        if career_div is not None:
            for div in career_div.find_all(attrs={"class": "con"}):
                work_info = WorkInfo()
                ps = div.find_all("p")
                for p in ps:
                    a = p.find("a")
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if "(" in text:
                            work_info.date = text.strip().split("(")[1].strip(")")
                    else:
                        text = p.text
                        if text.startswith(u"地区:"):
                            work_info.location = text.split(u":", 1)[1]
                        elif text.startswith(u"职位:"):
                            work_info.position = text.split(u":", 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)

        # Education history: same structure as work history.
        weibo_user.info.edu = []
        if edu_div is not None:
            for div in edu_div.find_all(attrs={"class": "con"}):
                edu_info = EduInfo()
                ps = div.find_all("p")
                for p in ps:
                    a = p.find("a")
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if "(" in text:
                            edu_info.date = text.strip().split("(")[1].strip(")")
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)

        # Tags: each <a> inside a "con" block is one tag string.
        weibo_user.info.tags = []
        if tags_div is not None:
            for div in tags_div.find_all(attrs={"class": "con"}):
                for a in div.find_all("a"):
                    weibo_user.info.tags.append(a.text)

        weibo_user.save()
        self.logger.debug("parse %s finish" % url)
        return [], []
Exemplo n.º 2
0
    def parse(self, url=None):
        if self.bundle.exists is False:
            return

        url = url or self.url
        try:
            br = self.opener.browse_open(url)
        except Exception as e:
            print(e)
            print('休息10分钟!')
            time.sleep(60 * 10)


#         self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        new_style = False

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all(
                            'div', attrs={'class': 'WB_cardwrap'}):
                        block_title_div = block_div.find(
                            'h4', attrs={'class': 'obj_name'})
                        if block_title_div is None:
                            block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\
                                .find('h2')
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find(
                            'div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
                    weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    weibo_user.info.n_follows = int(tds[0].find('strong').text)
                    weibo_user.info.n_fans = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('p', attrs='photo_wrap')\
                                                .find('img')['src']
            elif 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={
                        'class': 'pt_title'
                    }).text.strip().strip(u':')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = WorkInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if '(' in text:
                                work_info.date = text.strip().split(
                                    '(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区:'):
                                work_info.location = text.split(u':', 1)[1]
                            elif text.startswith(u'职位:'):
                                work_info.position = text.split(u':', 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = WorkInfo()

                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info.name = a.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区:'):
                            work_info.location = l.split(u':', 1)[1]
                        elif l.startswith(u'职位:'):
                            work_info.position = l.split(u':', 1)[1]
                        else:
                            work_info.detail = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()

                    weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = EduInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if '(' in text:
                                edu_info.date = text.strip().split(
                                    '(')[1].strip().strip(')')
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)

                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx + 1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos:end_pos]

                    edu_info = EduInfo()
                    edu_info.name = name
                    if '(' in text:
                        edu_info.date = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')') + 1:]
                    text = text[end_pos:]
                    edu_info.detail = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find('span', attrs={
                        'class': 'pt_detail'
                }).find_all('a'):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()
        #         self.logger.debug('parse %s finish' % url)

        # counter add one for the profile url
        self.counter.inc('processed_profile_page', 1)
Exemplo n.º 3
0
 def parse(self, url=None):
     """Parse a weibo user's profile page and save the extracted info.

     Decodes the JSON payloads embedded in inline ``FM.view(...)`` /
     ``STK.pageletM.view(...)`` script calls and fills
     ``weibo_user.info`` with the basic profile fields, work history,
     education history, avatar and tags, then saves the user.

     :param url: page url to fetch; defaults to ``self.url``.
     :return: two empty lists (this parser yields no follow-up urls).
     """
     # PEP 8: use an identity check, not `== False`, for a boolean flag.
     if self.bundle.exists is False:
         return [], []

     url = url or self.url
     br = self.opener.browse_open(url)
     self.logger.debug('load %s finish' % url)
     soup = beautiful_soup(br.response().read())

     # Stop early when the fetched page fails validation.
     if not self.check(url, br):
         return [], []

     weibo_user = self.get_weibo_user()
     info = weibo_user.info
     if info is None:
         weibo_user.info = UserInfo()

     # Locate the HTML fragments that hold each info section.
     profile_div = None
     career_div = None
     edu_div = None
     tags_div = None
     for script in soup.find_all('script'):
         text = script.text
         if text.startswith('FM.view'):
             # Strip the `FM.view(` wrapper and trailing `);` to get JSON.
             text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
             data = json.loads(text)
             domid = data['domid']
             if domid == 'Pl_Official_LeftInfo__13':
                 info_soup = beautiful_soup(data['html'])
                 info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                 for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                     block_title = block_div.find('form').text.strip()
                     if block_title == u'基本信息':
                         profile_div = block_div
                     elif block_title == u'工作信息':
                         career_div = block_div
                     elif block_title == u'教育信息':
                         edu_div = block_div
                     elif block_title == u'标签信息':
                         tags_div = block_div
             elif domid == 'Pl_Official_Header__1':
                 header_soup = beautiful_soup(data['html'])
                 weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                             .find('img')['src']
         elif 'STK' in text:
             # Older page style: same idea behind an STK pagelet wrapper.
             text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
             data = json.loads(text)
             pid = data['pid']
             if pid == 'pl_profile_infoBase':
                 profile_div = beautiful_soup(data['html'])
             elif pid == 'pl_profile_infoCareer':
                 career_div = beautiful_soup(data['html'])
             elif pid == 'pl_profile_infoEdu':
                 edu_div = beautiful_soup(data['html'])
             elif pid == 'pl_profile_infoTag':
                 tags_div = beautiful_soup(data['html'])
             elif pid == 'pl_profile_photo':
                 soup = beautiful_soup(data['html'])
                 weibo_user.info.avatar = soup.find('img')['src']

     # Maps the Chinese field label shown on the page to a UserInfo
     # attribute name, with an optional value-converter function.
     profile_map = {
         u'昵称': {'field': 'nickname'},
         u'所在地': {'field': 'location'},
         u'性别': {'field': 'sex', 
                 'func': lambda s: True if s == u'男' else False},
         u'生日': {'field': 'birth'},
         u'博客': {'field': 'blog'},
         u'个性域名': {'field': 'site'},
         u'简介': {'field': 'intro'},
         u'邮箱': {'field': 'email'},
         u'QQ': {'field': 'qq'},
         u'MSN': {'field': 'msn'}
     }
     if profile_div is not None:
         for div in profile_div.find_all(attrs={'class': 'pf_item'}):
             k = div.find(attrs={'class': 'label'}).text.strip()
             v = div.find(attrs={'class': 'con'}).text.strip()
             if k in profile_map:
                 # The custom-domain field may render as "text | value".
                 if k == u'个性域名' and '|' in v:
                     v = v.split('|')[1].strip()
                 func = (lambda s: s) \
                         if 'func' not in profile_map[k] \
                         else profile_map[k]['func']
                 v = func(v)
                 setattr(weibo_user.info, profile_map[k]['field'], v)

     # Work history: one WorkInfo per "con" block; the <a> carries the
     # company name, plain <p> lines carry location/position/details.
     weibo_user.info.work = []
     if career_div is not None:
         for div in career_div.find_all(attrs={'class': 'con'}):
             work_info = WorkInfo()
             ps = div.find_all('p')
             for p in ps:
                 a = p.find('a')
                 if a is not None:
                     work_info.name = a.text
                     text = p.text
                     if '(' in text:
                         work_info.date = text.strip().split('(')[1].strip(')')
                 else:
                     text = p.text
                     if text.startswith(u'地区:'):
                         work_info.location = text.split(u':', 1)[1]
                     elif text.startswith(u'职位:'):
                         work_info.position = text.split(u':', 1)[1]
                     else:
                         work_info.detail = text
             weibo_user.info.work.append(work_info)

     # Education history: same structure as work history.
     weibo_user.info.edu = []
     if edu_div is not None:
         for div in edu_div.find_all(attrs={'class': 'con'}):
             edu_info = EduInfo()
             ps = div.find_all('p')
             for p in ps:
                 a = p.find('a')
                 text = p.text
                 if a is not None:
                     edu_info.name = a.text
                     if '(' in text:
                         edu_info.date = text.strip().split('(')[1].strip(')')
                 else:
                     edu_info.detail = text
             weibo_user.info.edu.append(edu_info)

     # Tags: each <a> inside a "con" block is one tag string.
     weibo_user.info.tags = []
     if tags_div is not None:
         for div in tags_div.find_all(attrs={'class': 'con'}):
             for a in div.find_all('a'):
                 weibo_user.info.tags.append(a.text)

     weibo_user.save()
     self.logger.debug('parse %s finish' % url)
     return [], []
Exemplo n.º 4
0
    def parse(self, url=None):
        """Parse a weibo user's profile page and save the extracted info.

        Decodes the JSON payloads embedded in inline ``FM.view(...)`` /
        ``STK.pageletM.view(...)`` script calls and fills
        ``weibo_user.info`` with the basic profile fields, work history,
        education history, avatar and tags, then saves the user.

        :param url: page url to fetch; defaults to ``self.url``.
        :return: two empty lists (this parser yields no follow-up urls).
        """
        # PEP 8: use an identity check, not `== False`, for a boolean flag.
        if self.bundle.exists is False:
            return [], []

        url = url or self.url
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        # Stop early when the fetched page fails validation.
        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        # Locate the HTML fragments that hold each info section.
        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                # Strip the `FM.view(` wrapper and trailing `);` to get JSON.
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid == 'Pl_Official_LeftInfo__13':
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
            elif 'STK' in text:
                # Older page style: same idea behind an STK pagelet wrapper.
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        # Maps the Chinese field label shown on the page to a UserInfo
        # attribute name, with an optional value-converter function.
        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        if profile_div is not None:
            for div in profile_div.find_all(attrs={'class': 'pf_item'}):
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
                if k in profile_map:
                    # The custom-domain field may render as "text | value".
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        # Work history: one WorkInfo per "con" block; the <a> carries the
        # company name, plain <p> lines carry location/position/details.
        weibo_user.info.work = []
        if career_div is not None:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if '(' in text:
                            work_info.date = text.strip().split('(')[1].strip(
                                ')')
                    else:
                        text = p.text
                        if text.startswith(u'地区:'):
                            work_info.location = text.split(u':', 1)[1]
                        elif text.startswith(u'职位:'):
                            work_info.position = text.split(u':', 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)

        # Education history: same structure as work history.
        weibo_user.info.edu = []
        if edu_div is not None:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split('(')[1].strip(
                                ')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)

        # Tags: each <a> inside a "con" block is one tag string.
        weibo_user.info.tags = []
        if tags_div is not None:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)

        weibo_user.save()
        self.logger.debug('parse %s finish' % url)
        return [], []
Exemplo n.º 5
0
 def parse(self, url=None):
     """Parse an old-style weibo profile page into ``weibo_user.info``.

     Opens *url* (defaults to ``self.url``), extracts the JSON pagelets
     embedded in ``STK && STK.pageletM && STK.pageletM.view({...})``
     script tags, and fills in the user's basic profile fields, work
     history, education history and tags, then saves the user.

     :param url: profile page to fetch; defaults to ``self.url``.
     :return: ``([], [])`` on success (this parser yields no follow-up
              URLs/bundles); ``None`` when the bundle does not exist or
              the page check fails.
     """
     if self.bundle.exists == False:
         return

     url = url or self.url
     br = self.opener.browse_open(url)
     soup = BeautifulSoup(br.response().read())

     if not self.check(url, br):
         return

     weibo_user = self.get_weibo_user()
     info = weibo_user.info
     if info is None:
         weibo_user.info = UserInfo()

     # Each profile section is delivered as JSON inside a <script> tag;
     # strip the STK.pageletM.view( ... ) wrapper and parse the payload.
     profile_div = None
     career_div = None
     edu_div = None
     tags_div = None
     for script in soup.find_all('script'):
         text = script.text
         if 'STK' in text:
             text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
             data = json.loads(text)
             pid = data['pid']
             if pid == 'pl_profile_infoBase':
                 profile_div = BeautifulSoup(data['html'])
             elif pid == 'pl_profile_infoCareer':
                 career_div = BeautifulSoup(data['html'])
             elif pid == 'pl_profile_infoEdu':
                 edu_div = BeautifulSoup(data['html'])
             elif pid == 'pl_profile_infoTag':
                 tags_div = BeautifulSoup(data['html'])

     # Chinese field label -> UserInfo attribute; an optional 'func'
     # converts the raw text (sex becomes True for u'男').
     profile_map = {
         u'昵称': {'field': 'nickname'},
         u'所在地': {'field': 'location'},
         u'性别': {'field': 'sex',
                 'func': lambda s: True if s == u'男' else False},
         u'生日': {'field': 'birth'},
         u'博客': {'field': 'blog'},
         u'个性域名': {'field': 'site'},
         u'简介': {'field': 'intro'},
         u'邮箱': {'field': 'email'},
         u'QQ': {'field': 'qq'},
         u'MSN': {'field': 'msn'}
     }
     # A pagelet may be absent from the page, leaving its *_div as None;
     # guard every section (the original raised AttributeError here).
     if profile_div is not None:
         for div in profile_div.find_all(attrs={'class': 'pf_item'}):
             k = div.find(attrs={'class': 'label'}).text.strip()
             v = div.find(attrs={'class': 'con'}).text.strip()
             if k in profile_map:
                 func = (lambda s: s) \
                         if 'func' not in profile_map[k] \
                         else profile_map[k]['func']
                 v = func(v)
                 setattr(weibo_user.info, profile_map[k]['field'], v)

     weibo_user.info.work = []
     if career_div is not None:
         for div in career_div.find_all(attrs={'class': 'con'}):
             work_info = WorkInfo()
             ps = div.find_all('p')
             for p in ps:
                 a = p.find('a')
                 if a is not None:
                     work_info.name = a.text
                     text = p.text
                     if '(' in text:
                         # date range is parenthesised after the company name
                         work_info.date = text.strip().split('(')[1].strip(')')
                 else:
                     text = p.text
                     # Labels use the full-width colon u'：'; split on the same
                     # character (the original split on ASCII ':' and therefore
                     # never stripped the label prefix).
                     if text.startswith(u'地区：'):
                         work_info.location = text.split(u'：', 1)[1]
                     elif text.startswith(u'职位：'):
                         work_info.position = text.split(u'：', 1)[1]
                     else:
                         work_info.detail = text
             weibo_user.info.work.append(work_info)

     weibo_user.info.edu = []
     if edu_div is not None:
         for div in edu_div.find_all(attrs={'class': 'con'}):
             edu_info = EduInfo()
             ps = div.find_all('p')
             for p in ps:
                 a = p.find('a')
                 text = p.text
                 if a is not None:
                     edu_info.name = a.text
                     if '(' in text:
                         edu_info.date = text.strip().split('(')[1].strip(')')
                 else:
                     edu_info.detail = text
             weibo_user.info.edu.append(edu_info)

     weibo_user.info.tags = []
     if tags_div is not None:
         for div in tags_div.find_all(attrs={'class': 'con'}):
             for a in div.find_all('a'):
                 weibo_user.info.tags.append(a.text)

     weibo_user.save()
     return [], []
Exemplo n.º 6
0
    def parse(self, url=None):
        """Parse a weibo profile page and populate ``weibo_user.info``.

        Handles both page generations: the newer layout delivered through
        ``FM.view({...})`` script payloads (itself with two variants,
        tracked by ``new_style``) and the older layout delivered through
        ``STK.pageletM.view({...})`` payloads.  Fills in avatar,
        follow/fan counts, basic profile fields, work history, education
        history and tags, saves the user and bumps the processed-page
        counter.

        :param url: profile page to fetch; defaults to ``self.url``.
        :return: ``None`` — early when the bundle is missing or the page
                 check fails, otherwise after saving.
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
#         self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        # True once a 'Pl_Official_PersonalInfo__' pagelet is seen: the
        # newer layout uses different CSS hooks in the sections below.
        new_style = False

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                # strip the FM.view( ... ); wrapper, then parse the JSON inside
                text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    # older FM.view variant: sections are 'infoblock' divs
                    # titled by their <form> text
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    # newer FM.view variant: 'WB_cardwrap' cards titled by an
                    # 'obj_name' heading (h4, or div > h2 as fallback)
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap'}):
                        block_title_div = block_div.find('h4', attrs={'class': 'obj_name'})
                        if block_title_div is None:
                            block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\
                                .find('h2')
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find('div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1':
                    # old-style header: avatar plus follow/fan counts
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
                    weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    weibo_user.info.n_follows = int(tds[0].find('strong').text)
                    weibo_user.info.n_fans = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('p', attrs='photo_wrap')\
                                                .find('img')['src']
            elif 'STK' in text:
                # oldest layout: STK pagelets, one per profile section
                text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        # Chinese field label -> UserInfo attribute; an optional 'func'
        # converts the raw text (sex becomes True for u'男').
        profile_map = {
            u'昵称': {'field': 'nickname'},
            u'所在地': {'field': 'location'},
            u'性别': {'field': 'sex', 
                    'func': lambda s: True if s == u'男' else False},
            u'生日': {'field': 'birth'},
            u'博客': {'field': 'blog'},
            u'个性域名': {'field': 'site'},
            u'简介': {'field': 'intro'},
            u'邮箱': {'field': 'email'},
            u'QQ': {'field': 'qq'},
            u'MSN': {'field': 'msn'}
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={'class': 'pt_title'}).text.strip().strip(u'：')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    # personalized domain shows as 'label | value'
                    if k == u'个性域名' and '|' in v:
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = WorkInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if '(' in text:
                                # date range is parenthesised after the name
                                work_info.date = text.strip().split('(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区：'):
                                work_info.location = text.split(u'：', 1)[1]
                            elif text.startswith(u'职位：'):
                                work_info.position = text.split(u'：', 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                # new style: one pt_detail span per job, free text with the
                # company name as a link and fields on separate CRLF lines
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = WorkInfo()

                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info.name = a.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区：'):
                            work_info.location = l.split(u'：', 1)[1]
                        elif l.startswith(u'职位：'):
                            work_info.position = l.split(u'：', 1)[1]
                        else:
                            work_info.detail = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()

                    weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = EduInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if '(' in text:
                                edu_info.date = text.strip().split('(')[1].strip().strip(')')
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                # new style: one text run with school names as links; slice
                # the text between consecutive names to recover each entry's
                # date and detail
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)

                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx+1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos: end_pos]

                    edu_info = EduInfo()
                    edu_info.name = name
                    if '(' in text:
                        edu_info.date = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')')+1:]
                    # consume the processed slice so later finds don't rematch
                    text = text[end_pos:]
                    edu_info.detail = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find('span', attrs={'class': 'pt_detail'}).find_all('a'):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()
#         self.logger.debug('parse %s finish' % url)

        # counter add one for the profile url
        self.counter.inc('processed_profile_page', 1)
Exemplo n.º 7
0
                        work_info.date = contents[index].split(u'(')[1].strip(u')')
                    elif contents[index].startswith(u'地区:'):
                        work_info.location = contents[index].split(u':', 1)[1]
                    elif contents[index].startswith(u'职位:'):
                        work_info.position = contents[index].split(u':', 1)[1]
                    else:
                        work_info.detail = contents[index]
                weibo_user.info.work.append(work_info)
            
        weibo_user.info.edu = []
        if edu_div is not None:
            for div in edu_div.find_all('li'):
                contents = [unicode(ss) for ss in div.find(attrs={'class': 'pt_detail'}).stripped_strings]
                names = [link.text.strip() for link in div.find_all('a')]
                if len(names) == 1:
                    edu_info = EduInfo()
                    edu_info.name = names[0]
                    for index in range(1, len(contents)):
                        if '(' in contents[index]:
                            edu_info.date = contents[index].split(u'(')[1].strip(u'年)')
                        else:
                            edu_info.detail = contents[index]

                    weibo_user.info.edu.append(edu_info)
                elif len(names) > 1:
                    for name in names:
                        edu_info = EduInfo()
                        edu_info.name = name

                        start_index = contents.index(name)
                        for index in range(start_index+1, len(contents)):
Exemplo n.º 8
0
    def parse(self, url=None):
        """Parse an old-style weibo profile page into ``weibo_user.info``.

        Opens *url* (defaults to ``self.url``), extracts the JSON pagelets
        embedded in ``STK && STK.pageletM && STK.pageletM.view({...})``
        script tags, and fills in the user's basic profile fields, work
        history, education history and tags, then saves the user.

        :param url: profile page to fetch; defaults to ``self.url``.
        :return: ``([], [])`` on success (no follow-up URLs/bundles);
                 ``None`` when the bundle does not exist or the page
                 check fails.
        """
        if self.bundle.exists == False:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
        soup = BeautifulSoup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        # Each profile section is delivered as JSON inside a <script> tag;
        # strip the STK.pageletM.view( ... ) wrapper and parse the payload.
        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if 'STK' in text:
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = BeautifulSoup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = BeautifulSoup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = BeautifulSoup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = BeautifulSoup(data['html'])

        # Chinese field label -> UserInfo attribute; an optional 'func'
        # converts the raw text (sex becomes True for u'男').
        profile_map = {
            u'昵称': {
                'field': 'nickname'
            },
            u'所在地': {
                'field': 'location'
            },
            u'性别': {
                'field': 'sex',
                'func': lambda s: True if s == u'男' else False
            },
            u'生日': {
                'field': 'birth'
            },
            u'博客': {
                'field': 'blog'
            },
            u'个性域名': {
                'field': 'site'
            },
            u'简介': {
                'field': 'intro'
            },
            u'邮箱': {
                'field': 'email'
            },
            u'QQ': {
                'field': 'qq'
            },
            u'MSN': {
                'field': 'msn'
            }
        }
        # A pagelet may be absent from the page, leaving its *_div as None;
        # guard every section (the original raised AttributeError here).
        if profile_div is not None:
            for div in profile_div.find_all(attrs={'class': 'pf_item'}):
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
                if k in profile_map:
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if '(' in text:
                            # date range is parenthesised after the name
                            work_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        text = p.text
                        # Labels use the full-width colon u'：'; split on the
                        # same character (the original split on ASCII ':' and
                        # therefore never stripped the label prefix).
                        if text.startswith(u'地区：'):
                            work_info.location = text.split(u'：', 1)[1]
                        elif text.startswith(u'职位：'):
                            work_info.position = text.split(u'：', 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)

        weibo_user.save()
        return [], []
Exemplo n.º 9
0
    def parse(self, url=None):
        """Parse a weibo profile page and populate ``weibo_user.info``.

        Handles both page generations: the newer layout delivered through
        ``FM.view({...})`` script payloads (itself with two variants,
        tracked by ``new_style``) and the older layout delivered through
        ``STK.pageletM.view({...})`` payloads.  Fills in avatar,
        follow/fan counts, basic profile fields, work history, education
        history and tags, saves the user and bumps the processed-page
        counter.

        :param url: profile page to fetch; defaults to ``self.url``.
        :return: ``None`` — early when the bundle is missing or the page
                 check fails, otherwise after saving.
        """
        if self.bundle.exists is False:
            return

        url = url or self.url
        br = self.opener.browse_open(url)
        #         self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        # True once a "Pl_Official_PersonalInfo__" pagelet is seen: the
        # newer layout uses different CSS hooks in the sections below.
        new_style = False

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all("script"):
            text = script.text
            if text.startswith("FM.view"):
                # strip the FM.view( ... ); wrapper, then parse the JSON inside
                text = text.strip().replace(";", "").replace("FM.view(", "")[:-1]
                data = json.loads(text)
                domid = data["domid"]
                if domid.startswith("Pl_Official_LeftInfo__"):
                    # older FM.view variant: sections are 'infoblock' divs
                    # titled by their <form> text
                    info_soup = beautiful_soup(data["html"])
                    info_div = info_soup.find("div", attrs={"class": "profile_pinfo"})
                    for block_div in info_div.find_all("div", attrs={"class": "infoblock"}):
                        block_title = block_div.find("form").text.strip()
                        if block_title == u"基本信息":
                            profile_div = block_div
                        elif block_title == u"工作信息":
                            career_div = block_div
                        elif block_title == u"教育信息":
                            edu_div = block_div
                        elif block_title == u"标签信息":
                            tags_div = block_div
                elif domid.startswith("Pl_Official_PersonalInfo__"):
                    # newer FM.view variant: 'WB_cardwrap' cards titled by an
                    # 'obj_name' heading (h4, or div > h2 as fallback)
                    new_style = True
                    info_soup = beautiful_soup(data["html"])
                    for block_div in info_soup.find_all("div", attrs={"class": "WB_cardwrap"}):
                        block_title_div = block_div.find("h4", attrs={"class": "obj_name"})
                        if block_title_div is None:
                            block_title_div = block_div.find("div", attrs={"class": "obj_name"}).find("h2")
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find("div", attrs={"class": "WB_innerwrap"})
                        if block_title == u"基本信息":
                            profile_div = inner_div
                        elif block_title == u"工作信息":
                            career_div = inner_div
                        elif block_title == u"教育信息":
                            edu_div = inner_div
                        elif block_title == u"标签信息":
                            tags_div = inner_div
                elif domid == "Pl_Official_Header__1":
                    # old-style header: avatar plus follow/fan counts
                    header_soup = beautiful_soup(data["html"])
                    weibo_user.info.avatar = header_soup.find("div", attrs={"class": "pf_head_pic"}).find("img")["src"]
                    weibo_user.info.n_follows = int(
                        header_soup.find("ul", attrs={"class": "user_atten"})
                        .find("strong", attrs={"node-type": "follow"})
                        .text
                    )
                    weibo_user.info.n_fans = int(
                        header_soup.find("ul", attrs={"class": "user_atten"})
                        .find("strong", attrs={"node-type": "fans"})
                        .text
                    )
                elif domid.startswith("Pl_Core_T8CustomTriColumn__"):
                    # new style friends info
                    header_soup = beautiful_soup(data["html"])
                    tds = header_soup.find("table", attrs={"class": "tb_counter"}).find_all("td")
                    weibo_user.info.n_follows = int(tds[0].find("strong").text)
                    weibo_user.info.n_fans = int(tds[1].find("strong").text)
                elif domid.startswith("Pl_Official_Headerv6__"):
                    # new style avatar info
                    header_soup = beautiful_soup(data["html"])
                    weibo_user.info.avatar = header_soup.find("p", attrs="photo_wrap").find("img")["src"]
            elif "STK" in text:
                # oldest layout: STK pagelets, one per profile section
                text = text.replace("STK && STK.pageletM && STK.pageletM.view(", "")[:-1]
                data = json.loads(text)
                pid = data["pid"]
                if pid == "pl_profile_infoBase":
                    profile_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoCareer":
                    career_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoEdu":
                    edu_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_infoTag":
                    tags_div = beautiful_soup(data["html"])
                elif pid == "pl_profile_photo":
                    soup = beautiful_soup(data["html"])
                    weibo_user.info.avatar = soup.find("img")["src"]

        # Chinese field label -> UserInfo attribute; an optional "func"
        # converts the raw text (sex becomes True for u"男").
        profile_map = {
            u"昵称": {"field": "nickname"},
            u"所在地": {"field": "location"},
            u"性别": {"field": "sex", "func": lambda s: True if s == u"男" else False},
            u"生日": {"field": "birth"},
            u"博客": {"field": "blog"},
            u"个性域名": {"field": "site"},
            u"简介": {"field": "intro"},
            u"邮箱": {"field": "email"},
            u"QQ": {"field": "qq"},
            u"MSN": {"field": "msn"},
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={"class": "pf_item"})
            else:
                divs = profile_div.find_all("li", attrs={"class": "li_1"})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={"class": "label"}).text.strip()
                    v = div.find(attrs={"class": "con"}).text.strip()
                else:
                    k = div.find("span", attrs={"class": "pt_title"}).text.strip().strip(u"：")
                    d = div.find("span", attrs={"class": "pt_detail"})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find("a").text.strip()
                if k in profile_map:
                    # personalized domain shows as 'label | value'
                    if k == u"个性域名" and "|" in v:
                        v = v.split("|")[1].strip()
                    func = (lambda s: s) if "func" not in profile_map[k] else profile_map[k]["func"]
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]["field"], v)

        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={"class": "con"}):
                    work_info = WorkInfo()
                    ps = div.find_all("p")
                    for p in ps:
                        a = p.find("a")
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if "(" in text:
                                # date range is parenthesised after the name
                                work_info.date = text.strip().split("(")[1].strip(")")
                        else:
                            text = p.text
                            if text.startswith(u"地区："):
                                work_info.location = text.split(u"：", 1)[1]
                            elif text.startswith(u"职位："):
                                work_info.position = text.split(u"：", 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                # new style: one pt_detail span per job, free text with the
                # company name as a link and fields on separate CRLF lines
                li = career_div.find("li", attrs={"class": "li_1"})
                for span in li.find_all("span", attrs={"class": "pt_detail"}):
                    work_info = WorkInfo()

                    text = span.text
                    a = span.find("a")
                    if a is not None:
                        work_info.name = a.text
                    if "(" in text:
                        work_info.date = (
                            text.strip()
                            .split("(")[1]
                            .replace("\r", "")
                            .replace("\n", "")
                            .replace("\t", "")
                            .split(")", 1)[0]
                        )

                    for l in text.split("\r\n"):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u"地区："):
                            work_info.location = l.split(u"：", 1)[1]
                        elif l.startswith(u"职位："):
                            work_info.position = l.split(u"：", 1)[1]
                        else:
                            work_info.detail = text.replace("\r", "").replace("\n", "").replace("\t", "").strip()

                    weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={"class": "con"}):
                    edu_info = EduInfo()
                    ps = div.find_all("p")
                    for p in ps:
                        a = p.find("a")
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if "(" in text:
                                edu_info.date = text.strip().split("(")[1].strip().strip(")")
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                # new style: one text run with school names as links; slice
                # the text between consecutive names to recover each entry's
                # date and detail
                span = edu_div.find("li", attrs={"class": "li_1"}).find("span", attrs={"class": "pt_detail"})
                text = span.text
                names = []
                for a in span.find_all("a"):
                    names.append(a.text)

                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx + 1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos:end_pos]

                    edu_info = EduInfo()
                    edu_info.name = name
                    if "(" in text:
                        edu_info.date = (
                            t.strip()
                            .split("(")[1]
                            .replace("\r", "")
                            .replace("\n", "")
                            .replace("\t", "")
                            .split(")", 1)[0]
                        )
                        t = t[t.find(")") + 1 :]
                    # consume the processed slice so later finds don't rematch
                    text = text[end_pos:]
                    edu_info.detail = t.replace("\r", "").replace("\n", "").replace("\t", "").strip()
                    weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={"class": "con"}):
                    for a in div.find_all("a"):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find("span", attrs={"class": "pt_detail"}).find_all("a"):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()
        #         self.logger.debug('parse %s finish' % url)

        # counter add one for the profile url
        self.counter.inc("processed_profile_page", 1)
Exemplo n.º 10
0
    def parse(self, url=None):
        """Fetch a weibo profile page and populate ``weibo_user.info``.

        Handles both the legacy layout (``FM.view`` / ``STK`` pagelet
        scripts) and the redesigned layout (``Pl_Official_PersonalInfo__*``
        cards), extracting avatar, follower/fan counts, verified/VIP flags,
        level, and the basic/career/education/tag info blocks, then
        persists everything via ``weibo_user.save()``.

        :param url: profile page url; defaults to ``self.url``.
        :raises FetchBannedError: when the page fetch or the uid
            extraction fails (interpreted as an anti-crawler ban).
        """
        url = url or self.url
        try:
            br = self.opener.browse_open(url)
            html = br.response().read()

            if not self.check(url, br):
                return

            # The numeric user id is embedded in an inline
            # CONFIG['oid']='...'; assignment on the page.
            self.uid = re.findall(r"CONFIG\['oid'\]='(.*)';", html)[0]
        except Exception:
            # Was a bare ``except:``; narrowed so KeyboardInterrupt /
            # SystemExit are no longer converted into a ban error.
            raise FetchBannedError("get banned on blog page")

        weibo_user = self.get_weibo_user(self.uid)
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        soup = beautiful_soup(html)
        # new_style flags the redesigned profile layout; it changes how the
        # info blocks located below are parsed further down.
        new_style = False

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        for script in soup.find_all('script'):
            text = script.text
            if text.startswith('FM.view'):
                # Strip the FM.view( ... ) wrapper to obtain the JSON payload.
                text = text.strip().replace(';', '').replace('FM.view(',
                                                             '')[:-1]
                data = json.loads(text)
                domid = data['domid']
                if domid.startswith('Pl_Official_LeftInfo__'):
                    # Old-style left column holding the titled info blocks.
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div',
                                              attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all(
                            'div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid.startswith('Pl_Official_PersonalInfo__'):
                    # New-style personal-info cards.
                    new_style = True
                    info_soup = beautiful_soup(data['html'])
                    for block_div in info_soup.find_all(
                            'div', attrs={'class': 'WB_cardwrap'}):
                        block_title_div = block_div.find(
                            'h4', attrs={'class': 'obj_name'})
                        if block_title_div is None:
                            block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\
                                .find('h2')
                        if block_title_div is None:
                            continue
                        block_title = block_title_div.text.strip()
                        inner_div = block_div.find(
                            'div', attrs={'class': 'WB_innerwrap'})
                        if block_title == u'基本信息':
                            profile_div = inner_div
                        elif block_title == u'工作信息':
                            career_div = inner_div
                        elif block_title == u'教育信息':
                            edu_div = inner_div
                        elif block_title == u'标签信息':
                            tags_div = inner_div
                elif domid == 'Pl_Official_Header__1' and 'html' in data:
                    # Old-style header: avatar plus follow / fan counters.
                    # (``in`` replaces the Py2-only ``dict.has_key``.)
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']

                    weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                    .find('strong', attrs={'node-type': 'follow'}).text)
                    weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                 .find('strong', attrs={'node-type': 'fans'}).text)
                elif domid.startswith('Pl_Core_T8CustomTriColumn__'
                                      ) and 'html' in data:
                    # new style friends info
                    header_soup = beautiful_soup(data['html'])
                    tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                                .find_all('td')
                    weibo_user.info.n_follows = int(tds[0].find('strong').text)
                    weibo_user.info.n_fans = int(tds[1].find('strong').text)
                elif domid.startswith('Pl_Official_Headerv6__'):
                    # new style avatar info
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('p', attrs='photo_wrap')\
                                                .find('img')['src']
                    # Presence of these promo anchors marks verified / VIP.
                    bs_verified = header_soup.find(
                        'a',
                        attrs={
                            "suda-data": "key=pc_apply_entry&value=feed_icon"
                        })
                    weibo_user.info.verified = True if bs_verified else False
                    bs_vip = header_soup.find(
                        'a',
                        attrs={
                            "suda-uatrack": "key=home_vip&value=home_feed_vip"
                        })
                    weibo_user.info.vip = True if bs_vip else False
                    weibo_user.info.pf_intro = header_soup.find('div',
                                                                attrs={
                                                                    'class':
                                                                    'pf_intro'
                                                                }).text
                elif domid.startswith('Pl_Official_RightGrowNew'):
                    # Level panel: span[1] holds the score, span[0] holds
                    # text like "Lv.5" from which the level is split out.
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.level_score = int(
                        header_soup.find('p', attrs={
                            'class': 'level_info'
                        }).find_all('span', attrs={'class':
                                                   'S_txt1'})[1].text.strip())
                    weibo_user.info.level = int(
                        header_soup.find('p', attrs={
                            'class': 'level_info'
                        }).find_all('span',
                                    attrs={'class': 'S_txt1'
                                           })[0].text.strip().split('.')[1])

            elif 'STK' in text:
                # Oldest layout: STK.pageletM.view(...) payloads keyed by pid.
                text = text.replace(
                    'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']

        # Maps the on-page (Chinese) field label to the UserInfo attribute;
        # an optional 'func' converts the raw text before assignment.
        profile_map = {
            u'昵称': {'field': 'nickname'},
            u'所在地': {'field': 'location'},
            u'性别': {'field': 'gender'},
            u'生日': {
                'field': 'birth',
                # Parse dates shaped like u'1990年1月1日'; otherwise None.
                'func': lambda v: datetime.strptime(
                    v.replace(u'年', '/').replace(u'月', '/').replace(u'日', ''),
                    '%Y/%m/%d') if re.match(u'\d+年\d+月\d+日', v) else None
            },
            u'博客': {'field': 'blog'},
            u'个性域名': {'field': 'site'},
            u'简介': {'field': 'intro'},
            u'邮箱': {'field': 'email'},
            u'QQ': {'field': 'qq'},
            u'MSN': {'field': 'msn'},
            u'注册时间': {'field': 'register_date'}
        }
        if profile_div is not None:
            if not new_style:
                divs = profile_div.find_all(attrs={'class': 'pf_item'})
            else:
                divs = profile_div.find_all('li', attrs={'class': 'li_1'})
            for div in divs:
                if not new_style:
                    k = div.find(attrs={'class': 'label'}).text.strip()
                    v = div.find(attrs={'class': 'con'}).text.strip()
                else:
                    k = div.find('span', attrs={
                        'class': 'pt_title'
                    }).text.strip().strip(u':')
                    d = div.find('span', attrs={'class': 'pt_detail'})
                    if d:
                        v = d.text.strip()
                    else:
                        v = div.find('a').text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        # Value renders as "label | domain"; keep the domain.
                        v = v.split('|')[1].strip()

                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    try:
                        v = func(v)
                    except Exception:
                        # Conversion failure (e.g. malformed date) -> None.
                        v = None
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            if not new_style:
                for div in career_div.find_all(attrs={'class': 'con'}):
                    work_info = WorkInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        if a is not None:
                            work_info.name = a.text
                            text = p.text
                            if '(' in text:
                                work_info.date = text.strip().split(
                                    '(')[1].strip(')')
                        else:
                            text = p.text
                            if text.startswith(u'地区:'):
                                work_info.location = text.split(u':', 1)[1]
                            elif text.startswith(u'职位:'):
                                work_info.position = text.split(u':', 1)[1]
                            else:
                                work_info.detail = text
                    weibo_user.info.work.append(work_info)
            else:
                li = career_div.find('li', attrs={'class': 'li_1'})
                for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                    work_info = WorkInfo()

                    text = span.text
                    a = span.find('a')
                    if a is not None:
                        work_info.name = a.text
                    if '(' in text:
                        # Employment period sits in parentheses after the
                        # company name.
                        work_info.date = text.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]

                    for l in text.split('\r\n'):
                        l = l.strip()
                        if len(l) == 0:
                            continue
                        if l.startswith(u'地区:'):
                            work_info.location = l.split(u':', 1)[1]
                        elif l.startswith(u'职位:'):
                            work_info.position = l.split(u':', 1)[1]
                        else:
                            work_info.detail = text.replace('\r', '')\
                                                    .replace('\n', '')\
                                                    .replace('\t', '')\
                                                    .strip()

                    weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            if not new_style:
                for div in edu_div.find_all(attrs={'class': 'con'}):
                    edu_info = EduInfo()
                    ps = div.find_all('p')
                    for p in ps:
                        a = p.find('a')
                        text = p.text
                        if a is not None:
                            edu_info.name = a.text
                            if '(' in text:
                                edu_info.date = text.strip().split(
                                    '(')[1].strip().strip(')')
                        else:
                            edu_info.detail = text
                    weibo_user.info.edu.append(edu_info)
            else:
                # New style crams all schools into one span: school names
                # are the <a> texts, and the free text between two names
                # belongs to the earlier school.
                span = edu_div.find('li', attrs={'class': 'li_1'})\
                                .find('span', attrs={'class': 'pt_detail'})
                text = span.text
                names = []
                for a in span.find_all('a'):
                    names.append(a.text)

                for idx, name in enumerate(names):
                    start_pos = text.find(name) + len(name)
                    if idx < len(names) - 1:
                        end_pos = text.find(names[idx + 1], start_pos)
                    else:
                        end_pos = len(text)
                    t = text[start_pos:end_pos]

                    edu_info = EduInfo()
                    edu_info.name = name
                    # Fix: guard on the slice ``t`` that is actually split
                    # below (was ``'(' in text``, which raised IndexError
                    # whenever the parenthesis lay outside this school's
                    # segment).
                    if '(' in t:
                        edu_info.date = t.strip().split('(')[1]\
                                            .replace('\r', '')\
                                            .replace('\n', '')\
                                            .replace('\t', '')\
                                            .split(')', 1)[0]
                        t = t[t.find(')') + 1:]
                    text = text[end_pos:]
                    edu_info.detail = t.replace('\r', '').replace('\n', '')\
                                        .replace('\t', '').strip()
                    weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            if not new_style:
                for div in tags_div.find_all(attrs={'class': 'con'}):
                    for a in div.find_all('a'):
                        weibo_user.info.tags.append(a.text)
            else:
                for a in tags_div.find('span', attrs={
                        'class': 'pt_detail'
                }).find_all('a'):
                    weibo_user.info.tags.append(a.text.strip())

        weibo_user.save()

        # counter add one for the profile url
        self.counter.inc('processed_profile_page', 1)
Exemplo n.º 11
0
    def parse(self, url=None):
        """Parse a legacy-layout weibo profile page and save the user.

        Locates the profile / career / education / tag / level / credit
        blocks across the several pagelet variants (``FM.view`` and
        ``STK.pageletM.view`` payloads), fills ``weibo_user.info`` and
        persists it via ``weibo_user.save()``.

        Indentation note: the original body mixed tabs and spaces (a
        ``TabError`` under Python 3); this version is tabs-free with
        identical behavior.

        :param url: profile page url; defaults to ``self.url``.
        :return: an empty ``(urls, bundles)`` pair — this parser never
            yields follow-up links.
        """
        if self.bundle.exists == False:
            return [], []

        url = url or self.url
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())

        if not self.check(url, br):
            return [], []

        weibo_user = self.get_weibo_user()
        info = weibo_user.info
        if info is None:
            weibo_user.info = UserInfo()

        profile_div = None
        career_div = None
        edu_div = None
        tags_div = None
        weibo_ul = None
        rank_div = None
        credit_div = None
        head_pic_div = None
        user_atten_div = None
        for script in soup.find_all('script'):
            text = script.text

            # Only a handful of FM.view pagelets carry profile data.
            if text.startswith('FM.view') and \
               ("Pl_Official_LeftInfo__17" in text \
                or "Pl_Official_Header__1" in text \
                or "Pl_Official_RightGrow__17" in text \
                or "Pl_Official_LeftInfo__36" in text \
                or "Pl_Official_LeftInfo__41" in text \
                or "Pl_Core_Header__1" in text \
                ):
                # Strip the FM.view( ... ) wrapper (and a trailing ';')
                # to obtain the JSON payload.
                text = text.replace('FM.view(', '')[:-1]
                if text.endswith(';'):
                    text = text[:-1]

                data = json.loads(text)
                domid = data['domid']
                if domid == 'Pl_Official_LeftInfo__17' or domid == 'Pl_Official_LeftInfo__36'\
                   or domid == 'Pl_Official_LeftInfo__41':
                    # Left column holding the titled info blocks.
                    info_soup = beautiful_soup(data['html'])
                    info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                    for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'基本信息':
                            profile_div = block_div
                        elif block_title == u'工作信息':
                            career_div = block_div
                        elif block_title == u'教育信息':
                            edu_div = block_div
                        elif block_title == u'标签信息':
                            tags_div = block_div
                elif domid == 'Pl_Official_RightGrow__17':
                    # Right column: level and credit panels.
                    right_soup = beautiful_soup(data['html'])
                    right_div = right_soup.find('div', attrs={'class': 'prm_app_pinfo'})

                    for block_div in right_div.find_all('div', attrs={'class': 'info_block'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'等级信息':
                            rank_div = block_div
                        elif block_title == u'信用信息':
                            credit_div = block_div

                elif domid == 'Pl_Official_Header__1':
                    header_soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                .find('img')['src']
                    weibo_ul = header_soup.find('ul', attrs={'class': 'user_atten clearfix user_atten_s'})

                elif domid == 'Pl_Core_Header__1':
                    core_header_soup = beautiful_soup(data['html'])
                    head_div = core_header_soup.find('div', attrs={'class': 'pf_head S_bg5 S_line1'})
                    head_pic_div = head_div.find('div', attrs={'class': 'pf_head_pic'})
                    user_atten_div = head_div.find('div', attrs={'class': 'user_atten'})

            elif 'STK' in text:
                # Oldest layout: STK.pageletM.view(...) payloads keyed by pid.
                text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
                data = json.loads(text)
                pid = data['pid']
                if pid == 'pl_profile_infoBase':
                    profile_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoCareer':
                    career_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoEdu':
                    edu_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoTag':
                    tags_div = beautiful_soup(data['html'])
                elif pid == 'pl_profile_infoGrow':
                    right_soup = beautiful_soup(data['html'])
                    right_div = right_soup.find('div', attrs={'class': 'prm_app_pinfo'})
                    for block_div in right_div.find_all('div', attrs={'class': 'info_block'}):
                        block_title = block_div.find('form').text.strip()
                        if block_title == u'等级信息':
                            rank_div = block_div
                        elif block_title == u'信用信息':
                            credit_div = block_div
                elif pid == 'pl_profile_photo':
                    soup = beautiful_soup(data['html'])
                    weibo_user.info.avatar = soup.find('img')['src']
                    weibo_ul = soup.find('ul', attrs={'class': 'user_atten clearfix user_atten_m'})
                elif pid == 'pl_leftNav_profilePersonal':
                    # Fallback source for avatar/nickname when the photo
                    # pagelet did not supply one.
                    if weibo_user.info.avatar is None:
                        soup = beautiful_soup(data['html'])
                        weibo_user.info.avatar = soup.find('div', attrs={'class': 'face_infor'}).find('img')['src']
                        weibo_user.info.nickname = soup.find('div', attrs={'class': 'face_infor'}).find('a', attrs={'class': 'logo_img'})['title']
                elif pid == 'pl_content_litePersonInfo':
                    soup = beautiful_soup(data['html'])
                    weibo_ul = soup.find('ul', attrs={'class': 'user_atten clearfix'})

        # Maps the on-page (Chinese) field label to the UserInfo attribute;
        # an optional 'func' would convert the raw text before assignment.
        profile_map = {
            u'昵称': {'field': 'nickname'},
            u'真实姓名': {'field': 'realname'},
            u'所在地': {'field': 'location'},
            u'性别': {'field': 'sex'},
            u'性取向': {'field': 'sex_dir'},
            u'生日': {'field': 'birth'},
            u'感情状况': {'field': 'love'},
            u'血型': {'field': 'blood_type'},
            u'博客': {'field': 'blog'},
            u'个性域名': {'field': 'site'},
            u'简介': {'field': 'intro'},
            u'邮箱': {'field': 'email'},
            u'QQ': {'field': 'qq'},
            u'MSN': {'field': 'msn'}
        }
        if profile_div is not None:
            for div in profile_div.find_all(attrs={'class': 'pf_item'}):
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
                if k in profile_map:
                    if k == u'个性域名' and '|' in v:
                        # Value renders as "label | domain"; keep the domain.
                        v = v.split('|')[1].strip()
                    func = (lambda s: s) \
                            if 'func' not in profile_map[k] \
                            else profile_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, profile_map[k]['field'], v)

        rank_map = {
            u'当前等级': {'field': 'rank'},
            u'活跃天数': {'field': 'active_day'},
        }
        if rank_div is not None:
            for div in rank_div.find_all(attrs={'class': 'info'}):
                # First 4 chars of the row text are the label.
                k = div.text.strip()[:4]
                v = div.find(attrs={'class': 'S_txt1 point'}).text.strip('LV')
                if k in rank_map:
                    func = (lambda s: s) \
                            if 'func' not in rank_map[k] \
                            else rank_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, rank_map[k]['field'], v)

        credit_map = {
            u'信用等级': {'field': 'credit_rank'},
            u'当前信用积分': {'field': 'credit'},
        }
        if credit_div is not None:
            for div in credit_div.find_all(attrs={'class': 'info'}):
                if u'信用等级' in div.text.strip():
                    k = div.text.strip()[:4]
                    v = div.find(attrs={'class': 'S_txt1'}).text.strip()
                else:
                    k = div.text.strip()[:6]
                    v = div.find(attrs={'class': 'S_txt1 point'}).text.strip()
                if k in credit_map:
                    func = (lambda s: s) \
                            if 'func' not in credit_map[k] \
                            else credit_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, credit_map[k]['field'], v)

        weibo_map = {
            u'关注': {'field': 'follow_num'},
            u'粉丝': {'field': 'fans_num'},
            u'微博': {'field': 'weibo_num'},
        }
        if weibo_ul is not None:
            for li in weibo_ul.find_all('li'):
                k = li.find('span').text.strip()
                v = li.find('strong').text.strip()
                if k in weibo_map:
                    func = (lambda s: s) \
                            if 'func' not in weibo_map[k] \
                            else weibo_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, weibo_map[k]['field'], v)

        weibo_user.info.work = []
        if career_div is not None:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if '(' in text:
                            # Employment period sits in parentheses after
                            # the company name.
                            work_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        text = p.text
                        if text.startswith(u'地区:'):
                            work_info.location = text.split(u':', 1)[1]
                        elif text.startswith(u'职位:'):
                            work_info.position = text.split(u':', 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)

        weibo_user.info.edu = []
        if edu_div is not None:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)

        weibo_user.info.tags = []
        if tags_div is not None:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)

        # Core-header fallback for avatar/nickname.
        if head_pic_div is not None and weibo_user.info.avatar is None:
            weibo_user.info.avatar = head_pic_div.find('img')['src']
            weibo_user.info.nickname = head_pic_div.find('img')['title']

        # Table-based counter fallback when no <ul> counters were found.
        if weibo_ul is None and user_atten_div is not None:
            for td in user_atten_div.find_all('td'):
                k = td.find('span').text.strip()
                v = td.find('strong').text.strip()
                if k in weibo_map:
                    func = (lambda s: s) \
                            if 'func' not in weibo_map[k] \
                            else weibo_map[k]['func']
                    v = func(v)
                    setattr(weibo_user.info, weibo_map[k]['field'], v)

        weibo_user.save()
        self.logger.debug('parse %s finish' % url)
        return [], []