def parse(self, url=None):
    """Download a Weibo profile page and save what it says about the user.

    The page carries its content as JSON arguments to ``FM.view(...)`` or
    ``STK.pageletM.view(...)`` calls inside ``<script>`` tags.  The HTML held
    by those payloads is searched for the basic-profile, career, education
    and tag sections plus the avatar; the results are written to
    ``weibo_user.info`` and ``weibo_user.save()`` is called.

    :param url: page to fetch; ``self.url`` is used when it is empty.
    :return: always a pair of empty lists.
    """
    if self.bundle.exists == False:  # noqa: E712 -- equality test kept deliberately
        return [], []

    url = url or self.url
    br = self.opener.browse_open(url)
    self.logger.debug("load %s finish" % url)
    page = beautiful_soup(br.response().read())
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    if weibo_user.info is None:
        weibo_user.info = UserInfo()
    info = weibo_user.info

    # Which section a block title (FM layout) or pagelet id (STK layout) feeds.
    section_by_title = {
        u"基本信息": "profile",
        u"工作信息": "career",
        u"教育信息": "edu",
        u"标签信息": "tags",
    }
    section_by_pid = {
        "pl_profile_infoBase": "profile",
        "pl_profile_infoCareer": "career",
        "pl_profile_infoEdu": "edu",
        "pl_profile_infoTag": "tags",
    }
    sections = dict.fromkeys(("profile", "career", "edu", "tags"))

    for script in page.find_all("script"):
        source = script.text
        if "FM.view" in source:
            # Remove the call prefix and the final character (presumably the
            # closing parenthesis) so that only the JSON argument is left.
            payload = json.loads(source.replace("FM.view(", "")[:-1])
            domid = payload["domid"]
            if domid == "Pl_Official_LeftInfo__13":
                pinfo = beautiful_soup(payload["html"]).find(
                    "div", attrs={"class": "profile_pinfo"})
                for block in pinfo.find_all("div", attrs={"class": "infoblock"}):
                    title = block.find("form").text.strip()
                    if title in section_by_title:
                        sections[section_by_title[title]] = block
            elif domid == "Pl_Official_Header__1":
                head_pic = beautiful_soup(payload["html"]).find(
                    "div", attrs={"class": "pf_head_pic"})
                info.avatar = head_pic.find("img")["src"]
        elif "STK" in source:
            payload = json.loads(source.replace(
                "STK && STK.pageletM && STK.pageletM.view(", "")[:-1])
            pid = payload["pid"]
            if pid in section_by_pid:
                sections[section_by_pid[pid]] = beautiful_soup(payload["html"])
            elif pid == "pl_profile_photo":
                info.avatar = beautiful_soup(payload["html"]).find("img")["src"]

    # Page label -> (attribute on info, optional converter for the raw text).
    profile_fields = {
        u"昵称": ("nickname", None),
        u"所在地": ("location", None),
        u"性别": ("sex", lambda s: s == u"男"),
        u"生日": ("birth", None),
        u"博客": ("blog", None),
        u"个性域名": ("site", None),
        u"简介": ("intro", None),
        u"邮箱": ("email", None),
        u"QQ": ("qq", None),
        u"MSN": ("msn", None),
    }
    profile_div = sections["profile"]
    if profile_div is not None:
        for item in profile_div.find_all(attrs={"class": "pf_item"}):
            label = item.find(attrs={"class": "label"}).text.strip()
            value = item.find(attrs={"class": "con"}).text.strip()
            if label not in profile_fields:
                continue
            if label == u"个性域名" and "|" in value:
                # Keep only the part after the first "|".
                value = value.split("|")[1].strip()
            field, convert = profile_fields[label]
            if convert is not None:
                value = convert(value)
            setattr(info, field, value)

    info.work = []
    career_div = sections["career"]
    if career_div is not None:
        for entry in career_div.find_all(attrs={"class": "con"}):
            job = WorkInfo()
            for para in entry.find_all("p"):
                link = para.find("a")
                para_text = para.text
                if link is not None:
                    # The paragraph holding a link names the employer and may
                    # carry a parenthesised date after it.
                    job.name = link.text
                    if "(" in para_text:
                        job.date = para_text.strip().split("(")[1].strip(")")
                elif para_text.startswith(u"地区:"):
                    job.location = para_text.split(u":", 1)[1]
                elif para_text.startswith(u"职位:"):
                    job.position = para_text.split(u":", 1)[1]
                else:
                    job.detail = para_text
            info.work.append(job)

    info.edu = []
    edu_div = sections["edu"]
    if edu_div is not None:
        for entry in edu_div.find_all(attrs={"class": "con"}):
            school = EduInfo()
            for para in entry.find_all("p"):
                link = para.find("a")
                para_text = para.text
                if link is None:
                    school.detail = para_text
                    continue
                school.name = link.text
                if "(" in para_text:
                    school.date = para_text.strip().split("(")[1].strip(")")
            info.edu.append(school)

    info.tags = []
    tags_div = sections["tags"]
    if tags_div is not None:
        for entry in tags_div.find_all(attrs={"class": "con"}):
            for link in entry.find_all("a"):
                info.tags.append(link.text)

    weibo_user.save()
    self.logger.debug("parse %s finish" % url)
    return [], []
def parse(self, url=None):
    """Parse a Weibo profile page and store the extracted details on the user.

    The page embeds its content as JSON arguments to ``FM.view(...)`` or
    ``STK.pageletM.view(...)`` calls inside ``<script>`` tags.  Two ``FM``
    layouts are recognised: the ``Pl_Official_LeftInfo__*`` one and the
    ``Pl_Official_PersonalInfo__*`` one (tracked as ``new_style``), which
    uses different markup for every section.  Basic profile fields, work and
    education history, tags, avatar and follow/fan counts are written to
    ``weibo_user.info``; the user is then saved and the
    ``processed_profile_page`` counter is incremented.

    Fixes compared with the previous revision:

    * a failed page fetch used to sleep and then crash with ``NameError``
      because ``br`` was never bound; the fetch is now retried once after
      the pause and a second failure propagates to the caller;
    * a new-style education entry without a date no longer raises
      ``IndexError`` when another entry in the same list has one;
    * a new-style card that has neither an ``h4.obj_name`` nor a
      ``div.obj_name`` is skipped instead of raising ``AttributeError``.

    :param url: page to fetch; ``self.url`` is used when it is empty.
    :return: ``None`` in every case.
    """
    if self.bundle.exists is False:
        return

    url = url or self.url
    try:
        br = self.opener.browse_open(url)
    except Exception as e:
        print(e)
        print('休息10分钟!')
        time.sleep(60 * 10)
        # One retry after the pause; if it fails again let the error surface
        # rather than continuing with an unbound ``br``.
        br = self.opener.browse_open(url)
    soup = beautiful_soup(br.response().read())

    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()
    if weibo_user.info is None:
        weibo_user.info = UserInfo()
    info = weibo_user.info

    def strip_breaks(s):
        # Remove carriage returns, newlines and tabs.
        return s.replace('\r', '').replace('\n', '').replace('\t', '')

    # Which section a block title (FM layouts) or pagelet id (STK layout) feeds.
    section_by_title = {
        u'基本信息': 'profile',
        u'工作信息': 'career',
        u'教育信息': 'edu',
        u'标签信息': 'tags',
    }
    section_by_pid = {
        'pl_profile_infoBase': 'profile',
        'pl_profile_infoCareer': 'career',
        'pl_profile_infoEdu': 'edu',
        'pl_profile_infoTag': 'tags',
    }
    sections = dict.fromkeys(('profile', 'career', 'edu', 'tags'))
    new_style = False

    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            # Strip the call wrapper so only the JSON argument remains.
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid.startswith('Pl_Official_LeftInfo__'):
                info_div = beautiful_soup(data['html']).find(
                    'div', attrs={'class': 'profile_pinfo'})
                for block_div in info_div.find_all(
                        'div', attrs={'class': 'infoblock'}):
                    block_title = block_div.find('form').text.strip()
                    if block_title in section_by_title:
                        sections[section_by_title[block_title]] = block_div
            elif domid.startswith('Pl_Official_PersonalInfo__'):
                new_style = True
                info_soup = beautiful_soup(data['html'])
                for block_div in info_soup.find_all(
                        'div', attrs={'class': 'WB_cardwrap'}):
                    title_node = block_div.find(
                        'h4', attrs={'class': 'obj_name'})
                    if title_node is None:
                        name_div = block_div.find(
                            'div', attrs={'class': 'obj_name'})
                        if name_div is not None:
                            title_node = name_div.find('h2')
                    if title_node is None:
                        continue
                    block_title = title_node.text.strip()
                    if block_title in section_by_title:
                        sections[section_by_title[block_title]] = \
                            block_div.find(
                                'div', attrs={'class': 'WB_innerwrap'})
            elif domid == 'Pl_Official_Header__1':
                header_soup = beautiful_soup(data['html'])
                info.avatar = header_soup.find(
                    'div', attrs={'class': 'pf_head_pic'}).find('img')['src']
                counters = header_soup.find(
                    'ul', attrs={'class': 'user_atten'})
                info.n_follows = int(counters.find(
                    'strong', attrs={'node-type': 'follow'}).text)
                info.n_fans = int(counters.find(
                    'strong', attrs={'node-type': 'fans'}).text)
            elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                # new style friends info
                tds = beautiful_soup(data['html']).find(
                    'table', attrs={'class': 'tb_counter'}).find_all('td')
                info.n_follows = int(tds[0].find('strong').text)
                info.n_fans = int(tds[1].find('strong').text)
            elif domid.startswith('Pl_Official_Headerv6__'):
                # new style avatar info
                header_soup = beautiful_soup(data['html'])
                info.avatar = header_soup.find(
                    'p', attrs='photo_wrap').find('img')['src']
        elif 'STK' in text:
            text = text.replace(
                'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid in section_by_pid:
                sections[section_by_pid[pid]] = beautiful_soup(data['html'])
            elif pid == 'pl_profile_photo':
                info.avatar = beautiful_soup(data['html']).find('img')['src']

    # Page label -> (attribute on info, optional converter for the raw text).
    profile_fields = {
        u'昵称': ('nickname', None),
        u'所在地': ('location', None),
        u'性别': ('sex', lambda s: s == u'男'),
        u'生日': ('birth', None),
        u'博客': ('blog', None),
        u'个性域名': ('site', None),
        u'简介': ('intro', None),
        u'邮箱': ('email', None),
        u'QQ': ('qq', None),
        u'MSN': ('msn', None),
    }
    profile_div = sections['profile']
    if profile_div is not None:
        if new_style:
            items = profile_div.find_all('li', attrs={'class': 'li_1'})
        else:
            items = profile_div.find_all(attrs={'class': 'pf_item'})
        for item in items:
            if new_style:
                k = item.find(
                    'span', attrs={'class': 'pt_title'}
                ).text.strip().strip(u':')
                detail = item.find('span', attrs={'class': 'pt_detail'})
                if detail is not None:
                    v = detail.text.strip()
                else:
                    v = item.find('a').text.strip()
            else:
                k = item.find(attrs={'class': 'label'}).text.strip()
                v = item.find(attrs={'class': 'con'}).text.strip()
            if k not in profile_fields:
                continue
            if k == u'个性域名' and '|' in v:
                # Keep only the part after the first "|".
                v = v.split('|')[1].strip()
            field, convert = profile_fields[k]
            if convert is not None:
                v = convert(v)
            setattr(info, field, v)

    info.work = []
    career_div = sections['career']
    if career_div is not None:
        if not new_style:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                for p in div.find_all('p'):
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        work_info.name = a.text
                        if '(' in text:
                            work_info.date = text.strip().split(
                                '(')[1].strip(')')
                    elif text.startswith(u'地区:'):
                        work_info.location = text.split(u':', 1)[1]
                    elif text.startswith(u'职位:'):
                        work_info.position = text.split(u':', 1)[1]
                    else:
                        work_info.detail = text
                info.work.append(work_info)
        else:
            li = career_div.find('li', attrs={'class': 'li_1'})
            for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                work_info = WorkInfo()
                text = span.text
                a = span.find('a')
                if a is not None:
                    work_info.name = a.text
                if '(' in text:
                    work_info.date = strip_breaks(
                        text.strip().split('(')[1]).split(')', 1)[0]
                for line in text.split('\r\n'):
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith(u'地区:'):
                        work_info.location = line.split(u':', 1)[1]
                    elif line.startswith(u'职位:'):
                        work_info.position = line.split(u':', 1)[1]
                    else:
                        # Stores the whole span text, not just this line
                        # (unchanged from the previous revision).
                        work_info.detail = strip_breaks(text).strip()
                info.work.append(work_info)

    info.edu = []
    edu_div = sections['edu']
    if edu_div is not None:
        if not new_style:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                for p in div.find_all('p'):
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split(
                                '(')[1].strip().strip(')')
                    else:
                        edu_info.detail = text
                info.edu.append(edu_info)
        else:
            span = edu_div.find('li', attrs={'class': 'li_1'}).find(
                'span', attrs={'class': 'pt_detail'})
            text = span.text
            names = [a.text for a in span.find_all('a')]
            # All schools share one span: cut the text between consecutive
            # school names and consume it from the front as we go.
            for idx, name in enumerate(names):
                start_pos = text.find(name) + len(name)
                if idx < len(names) - 1:
                    end_pos = text.find(names[idx + 1], start_pos)
                else:
                    end_pos = len(text)
                t = text[start_pos:end_pos]
                edu_info = EduInfo()
                edu_info.name = name
                # Test this entry's own slice; testing the remaining text
                # raised IndexError for an undated entry.
                if '(' in t:
                    edu_info.date = strip_breaks(
                        t.strip().split('(')[1]).split(')', 1)[0]
                    t = t[t.find(')') + 1:]
                text = text[end_pos:]
                edu_info.detail = strip_breaks(t).strip()
                info.edu.append(edu_info)

    info.tags = []
    tags_div = sections['tags']
    if tags_div is not None:
        if not new_style:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    info.tags.append(a.text)
        else:
            tag_span = tags_div.find('span', attrs={'class': 'pt_detail'})
            for a in tag_span.find_all('a'):
                info.tags.append(a.text.strip())

    weibo_user.save()
    # counter add one for the profile url
    self.counter.inc('processed_profile_page', 1)
def parse(self, url=None):
    """Read a Weibo profile page and record the user's details.

    Content is delivered as JSON inside ``<script>`` tags, either as the
    argument of ``FM.view(...)`` or of ``STK.pageletM.view(...)``.  Every
    recognised payload contributes one section (basic profile, career,
    education, tags) or the avatar.  The sections are then parsed into
    ``weibo_user.info`` and ``weibo_user.save()`` is called.

    :param url: page to fetch; ``self.url`` is used when it is empty.
    :return: always a pair of empty lists.
    """
    if self.bundle.exists == False:  # noqa: E712 -- equality test kept deliberately
        return [], []

    url = url or self.url
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    document = beautiful_soup(br.response().read())
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    if weibo_user.info is None:
        weibo_user.info = UserInfo()
    user_info = weibo_user.info

    title_slots = {
        u'基本信息': 'profile',
        u'工作信息': 'career',
        u'教育信息': 'edu',
        u'标签信息': 'tags',
    }
    pagelet_slots = {
        'pl_profile_infoBase': 'profile',
        'pl_profile_infoCareer': 'career',
        'pl_profile_infoEdu': 'edu',
        'pl_profile_infoTag': 'tags',
    }
    # Only sections actually present on the page end up in here.
    found = {}

    for script_tag in document.find_all('script'):
        js = script_tag.text
        if js.startswith('FM.view'):
            js = js.strip().replace(';', '')
            data = json.loads(js.replace('FM.view(', '')[:-1])
            domid = data['domid']
            if domid == 'Pl_Official_LeftInfo__13':
                left_info = beautiful_soup(data['html']).find(
                    'div', attrs={'class': 'profile_pinfo'})
                for block in left_info.find_all(
                        'div', attrs={'class': 'infoblock'}):
                    slot = title_slots.get(block.find('form').text.strip())
                    if slot is not None:
                        found[slot] = block
            elif domid == 'Pl_Official_Header__1':
                header = beautiful_soup(data['html'])
                picture = header.find('div', attrs={'class': 'pf_head_pic'})
                user_info.avatar = picture.find('img')['src']
        elif 'STK' in js:
            data = json.loads(js.replace(
                'STK && STK.pageletM && STK.pageletM.view(', '')[:-1])
            pid = data['pid']
            if pid in pagelet_slots:
                found[pagelet_slots[pid]] = beautiful_soup(data['html'])
            elif pid == 'pl_profile_photo':
                photo = beautiful_soup(data['html'])
                user_info.avatar = photo.find('img')['src']

    # Page label -> attribute name; converters apply to the named attribute.
    label_to_field = {
        u'昵称': 'nickname',
        u'所在地': 'location',
        u'性别': 'sex',
        u'生日': 'birth',
        u'博客': 'blog',
        u'个性域名': 'site',
        u'简介': 'intro',
        u'邮箱': 'email',
        u'QQ': 'qq',
        u'MSN': 'msn',
    }
    converters = {'sex': lambda s: s == u'男'}

    profile_section = found.get('profile')
    if profile_section is not None:
        for row in profile_section.find_all(attrs={'class': 'pf_item'}):
            label = row.find(attrs={'class': 'label'}).text.strip()
            content = row.find(attrs={'class': 'con'}).text.strip()
            field = label_to_field.get(label)
            if field is None:
                continue
            if label == u'个性域名' and '|' in content:
                content = content.split('|')[1].strip()
            if field in converters:
                content = converters[field](content)
            setattr(user_info, field, content)

    user_info.work = []
    career_section = found.get('career')
    if career_section is not None:
        for block in career_section.find_all(attrs={'class': 'con'}):
            work = WorkInfo()
            for paragraph in block.find_all('p'):
                anchor = paragraph.find('a')
                line = paragraph.text
                if anchor is not None:
                    work.name = anchor.text
                    if '(' in line:
                        work.date = line.strip().split('(')[1].strip(')')
                    continue
                if line.startswith(u'地区:'):
                    work.location = line.split(u':', 1)[1]
                elif line.startswith(u'职位:'):
                    work.position = line.split(u':', 1)[1]
                else:
                    work.detail = line
            user_info.work.append(work)

    user_info.edu = []
    edu_section = found.get('edu')
    if edu_section is not None:
        for block in edu_section.find_all(attrs={'class': 'con'}):
            edu = EduInfo()
            for paragraph in block.find_all('p'):
                anchor = paragraph.find('a')
                line = paragraph.text
                if anchor is not None:
                    edu.name = anchor.text
                    if '(' in line:
                        edu.date = line.strip().split('(')[1].strip(')')
                else:
                    edu.detail = line
            user_info.edu.append(edu)

    user_info.tags = []
    tags_section = found.get('tags')
    if tags_section is not None:
        for block in tags_section.find_all(attrs={'class': 'con'}):
            for anchor in block.find_all('a'):
                user_info.tags.append(anchor.text)

    weibo_user.save()
    self.logger.debug('parse %s finish' % url)
    return [], []
def parse(self, url=None):
    """Extract profile data from a Weibo profile page into the user record.

    ``<script>`` tags on the page wrap JSON payloads in ``FM.view(...)`` or
    ``STK.pageletM.view(...)`` calls.  The payloads supply the HTML of the
    basic-profile, career, education and tag sections and the avatar.  Each
    section is parsed into ``weibo_user.info`` and the user is saved.

    :param url: page to fetch; ``self.url`` is used when it is empty.
    :return: always a pair of empty lists.
    """
    if self.bundle.exists == False:  # noqa: E712 -- equality test kept deliberately
        return [], []

    url = url or self.url
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    if weibo_user.info is None:
        weibo_user.info = UserInfo()

    def dated_name(target, paragraph, anchor):
        # A paragraph with a link names the entry; a parenthesised date may
        # follow the name in the paragraph text.
        target.name = anchor.text
        raw = paragraph.text
        if '(' in raw:
            target.date = raw.strip().split('(')[1].strip(')')

    def read_work(container):
        # Build one WorkInfo from the <p> children of a "con" element.
        job = WorkInfo()
        for paragraph in container.find_all('p'):
            anchor = paragraph.find('a')
            if anchor is not None:
                dated_name(job, paragraph, anchor)
                continue
            raw = paragraph.text
            if raw.startswith(u'地区:'):
                job.location = raw.split(u':', 1)[1]
            elif raw.startswith(u'职位:'):
                job.position = raw.split(u':', 1)[1]
            else:
                job.detail = raw
        return job

    def read_edu(container):
        # Build one EduInfo from the <p> children of a "con" element.
        school = EduInfo()
        for paragraph in container.find_all('p'):
            anchor = paragraph.find('a')
            if anchor is not None:
                dated_name(school, paragraph, anchor)
            else:
                school.detail = paragraph.text
        return school

    profile_div = career_div = edu_div = tags_div = None

    for script in soup.find_all('script'):
        body = script.text
        if body.startswith('FM.view'):
            cleaned = body.strip().replace(';', '').replace('FM.view(', '')
            data = json.loads(cleaned[:-1])
            domid = data['domid']
            if domid == 'Pl_Official_LeftInfo__13':
                info_soup = beautiful_soup(data['html'])
                info_div = info_soup.find(
                    'div', attrs={'class': 'profile_pinfo'})
                blocks = info_div.find_all('div', attrs={'class': 'infoblock'})
                for block_div in blocks:
                    heading = block_div.find('form').text.strip()
                    if heading == u'基本信息':
                        profile_div = block_div
                    elif heading == u'工作信息':
                        career_div = block_div
                    elif heading == u'教育信息':
                        edu_div = block_div
                    elif heading == u'标签信息':
                        tags_div = block_div
            elif domid == 'Pl_Official_Header__1':
                header_soup = beautiful_soup(data['html'])
                head_pic = header_soup.find(
                    'div', attrs={'class': 'pf_head_pic'})
                weibo_user.info.avatar = head_pic.find('img')['src']
            continue

        if 'STK' not in body:
            continue

        stk_prefix = 'STK && STK.pageletM && STK.pageletM.view('
        data = json.loads(body.replace(stk_prefix, '')[:-1])
        pid = data['pid']
        if pid == 'pl_profile_infoBase':
            profile_div = beautiful_soup(data['html'])
        elif pid == 'pl_profile_infoCareer':
            career_div = beautiful_soup(data['html'])
        elif pid == 'pl_profile_infoEdu':
            edu_div = beautiful_soup(data['html'])
        elif pid == 'pl_profile_infoTag':
            tags_div = beautiful_soup(data['html'])
        elif pid == 'pl_profile_photo':
            photo_soup = beautiful_soup(data['html'])
            weibo_user.info.avatar = photo_soup.find('img')['src']

    def keep(s):
        return s

    # Page label -> (attribute on info, converter for the raw text).
    profile_map = {
        u'昵称': ('nickname', keep),
        u'所在地': ('location', keep),
        u'性别': ('sex', lambda s: True if s == u'男' else False),
        u'生日': ('birth', keep),
        u'博客': ('blog', keep),
        u'个性域名': ('site', keep),
        u'简介': ('intro', keep),
        u'邮箱': ('email', keep),
        u'QQ': ('qq', keep),
        u'MSN': ('msn', keep),
    }

    if profile_div is not None:
        for item in profile_div.find_all(attrs={'class': 'pf_item'}):
            key = item.find(attrs={'class': 'label'}).text.strip()
            val = item.find(attrs={'class': 'con'}).text.strip()
            if key in profile_map:
                if key == u'个性域名' and '|' in val:
                    val = val.split('|')[1].strip()
                attr_name, convert = profile_map[key]
                setattr(weibo_user.info, attr_name, convert(val))

    weibo_user.info.work = []
    if career_div is not None:
        for con in career_div.find_all(attrs={'class': 'con'}):
            weibo_user.info.work.append(read_work(con))

    weibo_user.info.edu = []
    if edu_div is not None:
        for con in edu_div.find_all(attrs={'class': 'con'}):
            weibo_user.info.edu.append(read_edu(con))

    weibo_user.info.tags = []
    if tags_div is not None:
        for con in tags_div.find_all(attrs={'class': 'con'}):
            for anchor in con.find_all('a'):
                weibo_user.info.tags.append(anchor.text)

    weibo_user.save()
    self.logger.debug('parse %s finish' % url)
    return [], []
def parse(self, url=None):
    """Parse a Weibo profile page and store the user's details.

    The page carries its sections as JSON arguments to
    ``STK.pageletM.view(...)`` calls inside ``<script>`` tags.  The
    basic-profile, career, education and tag pagelets are parsed into
    ``weibo_user.info`` and the user is saved.

    Fixes compared with the previous revision:

    * a section whose pagelet is absent from the page is skipped; before,
      the missing section was dereferenced as ``None`` and raised
      ``AttributeError`` before anything was saved;
    * the early exits return the same ``([], [])`` pair as the normal exit
      instead of ``None``.

    :param url: page to fetch; ``self.url`` is used when it is empty.
    :return: always a pair of empty lists.
    """
    if self.bundle.exists == False:  # noqa: E712 -- equality test kept deliberately
        return [], []

    url = url or self.url
    br = self.opener.browse_open(url)
    soup = BeautifulSoup(br.response().read())

    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    if weibo_user.info is None:
        weibo_user.info = UserInfo()
    info = weibo_user.info

    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    for script in soup.find_all('script'):
        text = script.text
        if 'STK' not in text:
            continue
        # Remove the call prefix and the final character (presumably the
        # closing parenthesis) so that only the JSON argument is left.
        text = text.replace(
            'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
        data = json.loads(text)
        pid = data['pid']
        if pid == 'pl_profile_infoBase':
            profile_div = BeautifulSoup(data['html'])
        elif pid == 'pl_profile_infoCareer':
            career_div = BeautifulSoup(data['html'])
        elif pid == 'pl_profile_infoEdu':
            edu_div = BeautifulSoup(data['html'])
        elif pid == 'pl_profile_infoTag':
            tags_div = BeautifulSoup(data['html'])

    # Page label -> (attribute on info, optional converter for the raw text).
    profile_map = {
        u'昵称': ('nickname', None),
        u'所在地': ('location', None),
        u'性别': ('sex', lambda s: s == u'男'),
        u'生日': ('birth', None),
        u'博客': ('blog', None),
        u'个性域名': ('site', None),
        u'简介': ('intro', None),
        u'邮箱': ('email', None),
        u'QQ': ('qq', None),
        u'MSN': ('msn', None),
    }
    if profile_div is not None:
        for div in profile_div.find_all(attrs={'class': 'pf_item'}):
            k = div.find(attrs={'class': 'label'}).text.strip()
            v = div.find(attrs={'class': 'con'}).text.strip()
            if k not in profile_map:
                continue
            field, convert = profile_map[k]
            if convert is not None:
                v = convert(v)
            setattr(info, field, v)

    info.work = []
    if career_div is not None:
        for div in career_div.find_all(attrs={'class': 'con'}):
            work_info = WorkInfo()
            for p in div.find_all('p'):
                a = p.find('a')
                text = p.text
                if a is not None:
                    # The linked paragraph names the employer and may carry a
                    # parenthesised date after the name.
                    work_info.name = a.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1].strip(')')
                elif text.startswith(u'地区:'):
                    work_info.location = text.split(':', 1)[1]
                elif text.startswith(u'职位:'):
                    work_info.position = text.split(':', 1)[1]
                else:
                    work_info.detail = text
            info.work.append(work_info)

    info.edu = []
    if edu_div is not None:
        for div in edu_div.find_all(attrs={'class': 'con'}):
            edu_info = EduInfo()
            for p in div.find_all('p'):
                a = p.find('a')
                text = p.text
                if a is not None:
                    edu_info.name = a.text
                    if '(' in text:
                        edu_info.date = text.strip().split('(')[1].strip(')')
                else:
                    edu_info.detail = text
            info.edu.append(edu_info)

    info.tags = []
    if tags_div is not None:
        for div in tags_div.find_all(attrs={'class': 'con'}):
            for a in div.find_all('a'):
                info.tags.append(a.text)

    weibo_user.save()
    return [], []
def parse(self, url=None):
    """Parse a Weibo profile page and store the extracted details on the user.

    The page embeds its content as JSON arguments to ``FM.view(...)`` or
    ``STK.pageletM.view(...)`` calls inside ``<script>`` tags.  Two ``FM``
    layouts are recognised: the ``Pl_Official_LeftInfo__*`` one and the
    ``Pl_Official_PersonalInfo__*`` one (tracked as ``new_style``), which
    uses different markup for every section.  Basic profile fields, work and
    education history, tags, avatar and follow/fan counts are written to
    ``weibo_user.info``; the user is then saved and the
    ``processed_profile_page`` counter is incremented.

    Fixes compared with the previous revision:

    * a new-style education entry without a date no longer raises
      ``IndexError`` when another entry in the same list has one (the check
      looked at the remaining text instead of the entry's own slice);
    * a new-style card that has neither an ``h4.obj_name`` nor a
      ``div.obj_name`` is skipped instead of raising ``AttributeError``.

    :param url: page to fetch; ``self.url`` is used when it is empty.
    :return: ``None`` in every case.
    """
    if self.bundle.exists is False:
        return

    url = url or self.url
    br = self.opener.browse_open(url)
    soup = beautiful_soup(br.response().read())

    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()
    if weibo_user.info is None:
        weibo_user.info = UserInfo()
    info = weibo_user.info

    def strip_breaks(s):
        # Remove carriage returns, newlines and tabs.
        return s.replace('\r', '').replace('\n', '').replace('\t', '')

    def pick_section(title, node):
        # Remember ``node`` under the section its title denotes, if any.
        name = section_by_title.get(title)
        if name is not None:
            sections[name] = node

    section_by_title = {
        u'基本信息': 'profile',
        u'工作信息': 'career',
        u'教育信息': 'edu',
        u'标签信息': 'tags',
    }
    section_by_pid = {
        'pl_profile_infoBase': 'profile',
        'pl_profile_infoCareer': 'career',
        'pl_profile_infoEdu': 'edu',
        'pl_profile_infoTag': 'tags',
    }
    sections = {'profile': None, 'career': None, 'edu': None, 'tags': None}
    new_style = False

    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            # Strip the call wrapper so only the JSON argument remains.
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid.startswith('Pl_Official_LeftInfo__'):
                info_soup = beautiful_soup(data['html'])
                info_div = info_soup.find(
                    'div', attrs={'class': 'profile_pinfo'})
                for block_div in info_div.find_all(
                        'div', attrs={'class': 'infoblock'}):
                    pick_section(block_div.find('form').text.strip(),
                                 block_div)
            elif domid.startswith('Pl_Official_PersonalInfo__'):
                new_style = True
                info_soup = beautiful_soup(data['html'])
                for block_div in info_soup.find_all(
                        'div', attrs={'class': 'WB_cardwrap'}):
                    title_node = block_div.find(
                        'h4', attrs={'class': 'obj_name'})
                    if title_node is None:
                        name_div = block_div.find(
                            'div', attrs={'class': 'obj_name'})
                        if name_div is not None:
                            title_node = name_div.find('h2')
                    if title_node is None:
                        continue
                    inner_div = block_div.find(
                        'div', attrs={'class': 'WB_innerwrap'})
                    pick_section(title_node.text.strip(), inner_div)
            elif domid == 'Pl_Official_Header__1':
                header_soup = beautiful_soup(data['html'])
                info.avatar = header_soup.find(
                    'div', attrs={'class': 'pf_head_pic'}).find('img')['src']
                counters = header_soup.find(
                    'ul', attrs={'class': 'user_atten'})
                info.n_follows = int(counters.find(
                    'strong', attrs={'node-type': 'follow'}).text)
                info.n_fans = int(counters.find(
                    'strong', attrs={'node-type': 'fans'}).text)
            elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                # new style friends info
                header_soup = beautiful_soup(data['html'])
                tds = header_soup.find(
                    'table', attrs={'class': 'tb_counter'}).find_all('td')
                info.n_follows = int(tds[0].find('strong').text)
                info.n_fans = int(tds[1].find('strong').text)
            elif domid.startswith('Pl_Official_Headerv6__'):
                # new style avatar info
                header_soup = beautiful_soup(data['html'])
                info.avatar = header_soup.find(
                    'p', attrs='photo_wrap').find('img')['src']
        elif 'STK' in text:
            text = text.replace(
                'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid in section_by_pid:
                sections[section_by_pid[pid]] = beautiful_soup(data['html'])
            elif pid == 'pl_profile_photo':
                photo_soup = beautiful_soup(data['html'])
                info.avatar = photo_soup.find('img')['src']

    # Page label -> (attribute on info, optional converter for the raw text).
    profile_fields = {
        u'昵称': ('nickname', None),
        u'所在地': ('location', None),
        u'性别': ('sex', lambda s: s == u'男'),
        u'生日': ('birth', None),
        u'博客': ('blog', None),
        u'个性域名': ('site', None),
        u'简介': ('intro', None),
        u'邮箱': ('email', None),
        u'QQ': ('qq', None),
        u'MSN': ('msn', None),
    }
    profile_div = sections['profile']
    if profile_div is not None:
        if new_style:
            items = profile_div.find_all('li', attrs={'class': 'li_1'})
        else:
            items = profile_div.find_all(attrs={'class': 'pf_item'})
        for item in items:
            if new_style:
                k = item.find(
                    'span', attrs={'class': 'pt_title'}
                ).text.strip().strip(u':')
                detail = item.find('span', attrs={'class': 'pt_detail'})
                if detail is not None:
                    v = detail.text.strip()
                else:
                    v = item.find('a').text.strip()
            else:
                k = item.find(attrs={'class': 'label'}).text.strip()
                v = item.find(attrs={'class': 'con'}).text.strip()
            if k not in profile_fields:
                continue
            if k == u'个性域名' and '|' in v:
                # Keep only the part after the first "|".
                v = v.split('|')[1].strip()
            field, convert = profile_fields[k]
            if convert is not None:
                v = convert(v)
            setattr(info, field, v)

    info.work = []
    career_div = sections['career']
    if career_div is not None:
        if not new_style:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                for p in div.find_all('p'):
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        work_info.name = a.text
                        if '(' in text:
                            work_info.date = text.strip().split(
                                '(')[1].strip(')')
                    elif text.startswith(u'地区:'):
                        work_info.location = text.split(u':', 1)[1]
                    elif text.startswith(u'职位:'):
                        work_info.position = text.split(u':', 1)[1]
                    else:
                        work_info.detail = text
                info.work.append(work_info)
        else:
            li = career_div.find('li', attrs={'class': 'li_1'})
            for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                work_info = WorkInfo()
                text = span.text
                a = span.find('a')
                if a is not None:
                    work_info.name = a.text
                if '(' in text:
                    work_info.date = strip_breaks(
                        text.strip().split('(')[1]).split(')', 1)[0]
                for line in text.split('\r\n'):
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith(u'地区:'):
                        work_info.location = line.split(u':', 1)[1]
                    elif line.startswith(u'职位:'):
                        work_info.position = line.split(u':', 1)[1]
                    else:
                        # Stores the whole span text, not just this line
                        # (unchanged from the previous revision).
                        work_info.detail = strip_breaks(text).strip()
                info.work.append(work_info)

    info.edu = []
    edu_div = sections['edu']
    if edu_div is not None:
        if not new_style:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                for p in div.find_all('p'):
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split(
                                '(')[1].strip().strip(')')
                    else:
                        edu_info.detail = text
                info.edu.append(edu_info)
        else:
            span = edu_div.find('li', attrs={'class': 'li_1'}).find(
                'span', attrs={'class': 'pt_detail'})
            text = span.text
            names = [a.text for a in span.find_all('a')]
            # All schools share one span: cut the text between consecutive
            # school names and consume it from the front as we go.
            for idx, name in enumerate(names):
                start_pos = text.find(name) + len(name)
                if idx < len(names) - 1:
                    end_pos = text.find(names[idx + 1], start_pos)
                else:
                    end_pos = len(text)
                t = text[start_pos:end_pos]
                edu_info = EduInfo()
                edu_info.name = name
                # Test this entry's own slice, not the remaining text.
                if '(' in t:
                    edu_info.date = strip_breaks(
                        t.strip().split('(')[1]).split(')', 1)[0]
                    t = t[t.find(')') + 1:]
                text = text[end_pos:]
                edu_info.detail = strip_breaks(t).strip()
                info.edu.append(edu_info)

    info.tags = []
    tags_div = sections['tags']
    if tags_div is not None:
        if not new_style:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    info.tags.append(a.text)
        else:
            tag_span = tags_div.find('span', attrs={'class': 'pt_detail'})
            for a in tag_span.find_all('a'):
                info.tags.append(a.text.strip())

    weibo_user.save()
    # counter add one for the profile url
    self.counter.inc('processed_profile_page', 1)
work_info.position = contents[index].split(u':', 1)[1] else: work_info.detail = contents[index] weibo_user.info.work.append(work_info) weibo_user.info.edu = [] if edu_div is not None: for div in edu_div.find_all('li'): contents = [unicode(ss) for ss in div.find(attrs={'class': 'pt_detail'}).stripped_strings] names = [link.text.strip() for link in div.find_all('a')] if len(names) == 1: edu_info = EduInfo() edu_info.name = names[0] for index in range(1, len(contents)): if '(' in contents[index]: edu_info.date = contents[index].split(u'(')[1].strip(u'年)') else: edu_info.detail = contents[index] weibo_user.info.edu.append(edu_info) elif len(names) > 1: for name in names: edu_info = EduInfo() edu_info.name = name start_index = contents.index(name) for index in range(start_index+1, len(contents)): if contents[index] in names: break if '(' in contents[index]:
def parse(self, url=None):
    """Fetch a Weibo profile page and save the user's profile details.

    Sections arrive as JSON arguments to ``STK.pageletM.view(...)`` calls in
    the page's ``<script>`` tags.  The basic-profile, career, education and
    tag pagelets are parsed into ``weibo_user.info``, after which
    ``weibo_user.save()`` is called.

    Fixes compared with the previous revision:

    * sections whose pagelet is missing are treated as empty; before, the
      ``None`` placeholder was dereferenced and raised ``AttributeError``;
    * every exit path returns ``([], [])``; the early exits used to return
      ``None`` while the normal exit returned the pair.

    :param url: page to fetch; ``self.url`` is used when it is empty.
    :return: always a pair of empty lists.
    """
    if self.bundle.exists == False:  # noqa: E712 -- equality test kept deliberately
        return [], []

    url = url or self.url
    br = self.opener.browse_open(url)
    soup = BeautifulSoup(br.response().read())
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    if weibo_user.info is None:
        weibo_user.info = UserInfo()
    info = weibo_user.info

    def containers(section):
        # "con" elements of a section; nothing when the section is absent.
        if section is None:
            return []
        return section.find_all(attrs={'class': 'con'})

    def dated_name(target, paragraph, anchor):
        # A paragraph with a link names the entry; a parenthesised date may
        # follow the name in the paragraph text.
        target.name = anchor.text
        raw = paragraph.text
        if '(' in raw:
            target.date = raw.strip().split('(')[1].strip(')')

    pid_to_section = {
        'pl_profile_infoBase': 'profile',
        'pl_profile_infoCareer': 'career',
        'pl_profile_infoEdu': 'edu',
        'pl_profile_infoTag': 'tags',
    }
    sections = {'profile': None, 'career': None, 'edu': None, 'tags': None}
    for script in soup.find_all('script'):
        text = script.text
        if 'STK' in text:
            # Remove the call prefix and the final character (presumably the
            # closing parenthesis) so that only the JSON argument is left.
            text = text.replace(
                'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid in pid_to_section:
                sections[pid_to_section[pid]] = BeautifulSoup(data['html'])

    # Page label -> attribute name; converters apply to the named attribute.
    label_to_field = {
        u'昵称': 'nickname',
        u'所在地': 'location',
        u'性别': 'sex',
        u'生日': 'birth',
        u'博客': 'blog',
        u'个性域名': 'site',
        u'简介': 'intro',
        u'邮箱': 'email',
        u'QQ': 'qq',
        u'MSN': 'msn',
    }
    converters = {'sex': lambda s: s == u'男'}

    profile_div = sections['profile']
    if profile_div is not None:
        for div in profile_div.find_all(attrs={'class': 'pf_item'}):
            k = div.find(attrs={'class': 'label'}).text.strip()
            v = div.find(attrs={'class': 'con'}).text.strip()
            field = label_to_field.get(k)
            if field is None:
                continue
            if field in converters:
                v = converters[field](v)
            setattr(info, field, v)

    info.work = []
    for div in containers(sections['career']):
        work_info = WorkInfo()
        for p in div.find_all('p'):
            a = p.find('a')
            if a is not None:
                dated_name(work_info, p, a)
                continue
            text = p.text
            if text.startswith(u'地区:'):
                work_info.location = text.split(':', 1)[1]
            elif text.startswith(u'职位:'):
                work_info.position = text.split(':', 1)[1]
            else:
                work_info.detail = text
        info.work.append(work_info)

    info.edu = []
    for div in containers(sections['edu']):
        edu_info = EduInfo()
        for p in div.find_all('p'):
            a = p.find('a')
            if a is not None:
                dated_name(edu_info, p, a)
            else:
                edu_info.detail = p.text
        info.edu.append(edu_info)

    info.tags = []
    for div in containers(sections['tags']):
        for a in div.find_all('a'):
            info.tags.append(a.text)

    weibo_user.save()
    return [], []
def parse(self, url=None):
    """Fetch a Weibo profile page and store its fields on the Weibo user.

    The page at ``url`` (``self.url`` when ``url`` is falsy) is opened with
    ``self.opener`` and its ``<script>`` tags are scanned for embedded JSON:

    * ``FM.view({...})`` payloads are dispatched on ``domid``.
      ``Pl_Official_LeftInfo__*`` is the old layout;
      ``Pl_Official_PersonalInfo__*`` switches to the new layout
      (``new_style``).  Header pagelets supply the avatar and the
      follow/fan counts.
    * ``STK.pageletM.view({...})`` payloads are dispatched on ``pid``.

    The basic-profile, work, education and tag sections found this way are
    written to ``weibo_user.info``, the user is saved and the
    ``processed_profile_page`` counter is incremented by one.

    :param url: page to load; defaults to ``self.url``.
    :returns: ``None``.  Returns early, without touching the user, when
        ``self.bundle.exists`` is ``False`` or ``self.check`` rejects the
        response.
    """
    if self.bundle.exists is False:
        return

    url = url or self.url
    br = self.opener.browse_open(url)
    soup = beautiful_soup(br.response().read())

    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()
    if weibo_user.info is None:
        weibo_user.info = UserInfo()

    def flatten(s):
        """Remove carriage returns, newlines and tabs from ``s``."""
        return s.replace("\r", "").replace("\n", "").replace("\t", "")

    new_style = False
    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    for script in soup.find_all("script"):
        text = script.text
        if text.startswith("FM.view"):
            # Strip only the trailing ";" of the statement.  Removing every
            # semicolon would also eat those inside the embedded HTML
            # (entities, inline styles, user text).
            text = text.strip().rstrip(";").replace("FM.view(", "")[:-1]
            data = json.loads(text)
            domid = data["domid"]
            if domid.startswith("Pl_Official_LeftInfo__"):
                info_soup = beautiful_soup(data["html"])
                info_div = info_soup.find("div", attrs={"class": "profile_pinfo"})
                for block_div in info_div.find_all("div", attrs={"class": "infoblock"}):
                    block_title = block_div.find("form").text.strip()
                    if block_title == u"基本信息":
                        profile_div = block_div
                    elif block_title == u"工作信息":
                        career_div = block_div
                    elif block_title == u"教育信息":
                        edu_div = block_div
                    elif block_title == u"标签信息":
                        tags_div = block_div
            elif domid.startswith("Pl_Official_PersonalInfo__"):
                new_style = True
                info_soup = beautiful_soup(data["html"])
                for block_div in info_soup.find_all("div", attrs={"class": "WB_cardwrap"}):
                    # The card title is either <h4 class="obj_name"> or an
                    # <h2> inside <div class="obj_name">; cards with neither
                    # are skipped.
                    block_title_div = block_div.find("h4", attrs={"class": "obj_name"})
                    if block_title_div is None:
                        title_wrap = block_div.find("div", attrs={"class": "obj_name"})
                        if title_wrap is not None:
                            block_title_div = title_wrap.find("h2")
                    if block_title_div is None:
                        continue
                    block_title = block_title_div.text.strip()
                    inner_div = block_div.find("div", attrs={"class": "WB_innerwrap"})
                    if block_title == u"基本信息":
                        profile_div = inner_div
                    elif block_title == u"工作信息":
                        career_div = inner_div
                    elif block_title == u"教育信息":
                        edu_div = inner_div
                    elif block_title == u"标签信息":
                        tags_div = inner_div
            elif domid == "Pl_Official_Header__1":
                header_soup = beautiful_soup(data["html"])
                weibo_user.info.avatar = header_soup.find(
                    "div", attrs={"class": "pf_head_pic"}).find("img")["src"]
                weibo_user.info.n_follows = int(
                    header_soup.find("ul", attrs={"class": "user_atten"})
                    .find("strong", attrs={"node-type": "follow"})
                    .text
                )
                weibo_user.info.n_fans = int(
                    header_soup.find("ul", attrs={"class": "user_atten"})
                    .find("strong", attrs={"node-type": "fans"})
                    .text
                )
            elif domid.startswith("Pl_Core_T8CustomTriColumn__"):
                # new style friends info
                header_soup = beautiful_soup(data["html"])
                tds = header_soup.find(
                    "table", attrs={"class": "tb_counter"}).find_all("td")
                weibo_user.info.n_follows = int(tds[0].find("strong").text)
                weibo_user.info.n_fans = int(tds[1].find("strong").text)
            elif domid.startswith("Pl_Official_Headerv6__"):
                # new style avatar info
                header_soup = beautiful_soup(data["html"])
                weibo_user.info.avatar = header_soup.find(
                    "p", attrs="photo_wrap").find("img")["src"]
        elif "STK" in text:
            text = text.replace("STK && STK.pageletM && STK.pageletM.view(", "")[:-1]
            data = json.loads(text)
            pid = data["pid"]
            if pid == "pl_profile_infoBase":
                profile_div = beautiful_soup(data["html"])
            elif pid == "pl_profile_infoCareer":
                career_div = beautiful_soup(data["html"])
            elif pid == "pl_profile_infoEdu":
                edu_div = beautiful_soup(data["html"])
            elif pid == "pl_profile_infoTag":
                tags_div = beautiful_soup(data["html"])
            elif pid == "pl_profile_photo":
                soup = beautiful_soup(data["html"])
                weibo_user.info.avatar = soup.find("img")["src"]

    # Page label -> attribute on ``weibo_user.info``; an optional ``func``
    # converts the scraped text before it is stored.
    profile_map = {
        u"昵称": {"field": "nickname"},
        u"所在地": {"field": "location"},
        u"性别": {"field": "sex", "func": lambda s: s == u"男"},
        u"生日": {"field": "birth"},
        u"博客": {"field": "blog"},
        u"个性域名": {"field": "site"},
        u"简介": {"field": "intro"},
        u"邮箱": {"field": "email"},
        u"QQ": {"field": "qq"},
        u"MSN": {"field": "msn"},
    }
    if profile_div is not None:
        if not new_style:
            divs = profile_div.find_all(attrs={"class": "pf_item"})
        else:
            divs = profile_div.find_all("li", attrs={"class": "li_1"})
        for div in divs:
            if not new_style:
                k = div.find(attrs={"class": "label"}).text.strip()
                v = div.find(attrs={"class": "con"}).text.strip()
            else:
                k = div.find("span", attrs={"class": "pt_title"}).text.strip().strip(u":")
                d = div.find("span", attrs={"class": "pt_detail"})
                if d:
                    v = d.text.strip()
                else:
                    # No detail span: fall back to the link text.
                    v = div.find("a").text.strip()
            if k in profile_map:
                if k == u"个性域名" and "|" in v:
                    # Keep only the part after the "|" separator.
                    v = v.split("|")[1].strip()
                convert = profile_map[k].get("func")
                if convert is not None:
                    v = convert(v)
                setattr(weibo_user.info, profile_map[k]["field"], v)

    weibo_user.info.work = []
    if career_div is not None:
        if not new_style:
            for div in career_div.find_all(attrs={"class": "con"}):
                work_info = WorkInfo()
                for p in div.find_all("p"):
                    a = p.find("a")
                    text = p.text
                    if a is not None:
                        work_info.name = a.text
                        if "(" in text:
                            work_info.date = text.strip().split("(")[1].strip(")")
                    elif text.startswith(u"地区:"):
                        work_info.location = text.split(u":", 1)[1]
                    elif text.startswith(u"职位:"):
                        work_info.position = text.split(u":", 1)[1]
                    else:
                        work_info.detail = text
                weibo_user.info.work.append(work_info)
        else:
            li = career_div.find("li", attrs={"class": "li_1"})
            for span in li.find_all("span", attrs={"class": "pt_detail"}):
                work_info = WorkInfo()
                text = span.text
                a = span.find("a")
                if a is not None:
                    work_info.name = a.text
                if "(" in text:
                    work_info.date = flatten(
                        text.strip().split("(")[1]).split(")", 1)[0]
                for line in text.split("\r\n"):
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    if line.startswith(u"地区:"):
                        work_info.location = line.split(u":", 1)[1]
                    elif line.startswith(u"职位:"):
                        work_info.position = line.split(u":", 1)[1]
                    else:
                        # Any other non-empty line stores the whole span
                        # text, flattened, as the detail.
                        work_info.detail = flatten(text).strip()
                weibo_user.info.work.append(work_info)

    weibo_user.info.edu = []
    if edu_div is not None:
        if not new_style:
            for div in edu_div.find_all(attrs={"class": "con"}):
                edu_info = EduInfo()
                for p in div.find_all("p"):
                    a = p.find("a")
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if "(" in text:
                            edu_info.date = text.strip().split("(")[1].strip().strip(")")
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)
        else:
            span = edu_div.find("li", attrs={"class": "li_1"}).find(
                "span", attrs={"class": "pt_detail"})
            text = span.text
            names = [a.text for a in span.find_all("a")]
            # All schools share one span: the text between one school name
            # and the next belongs to the former.
            for idx, name in enumerate(names):
                start_pos = text.find(name) + len(name)
                if idx < len(names) - 1:
                    end_pos = text.find(names[idx + 1], start_pos)
                else:
                    end_pos = len(text)
                t = text[start_pos:end_pos]
                edu_info = EduInfo()
                edu_info.name = name
                # Test this school's own segment: checking the whole
                # remainder raised IndexError when only a later school had
                # a parenthesised date.
                if "(" in t:
                    edu_info.date = flatten(
                        t.strip().split("(")[1]).split(")", 1)[0]
                    t = t[t.find(")") + 1:]
                text = text[end_pos:]
                edu_info.detail = flatten(t).strip()
                weibo_user.info.edu.append(edu_info)

    weibo_user.info.tags = []
    if tags_div is not None:
        if not new_style:
            for div in tags_div.find_all(attrs={"class": "con"}):
                for a in div.find_all("a"):
                    weibo_user.info.tags.append(a.text)
        else:
            for a in tags_div.find("span", attrs={"class": "pt_detail"}).find_all("a"):
                weibo_user.info.tags.append(a.text.strip())

    weibo_user.save()

    # counter add one for the profile url
    self.counter.inc("processed_profile_page", 1)
def parse(self, url=None):
    """Fetch a Weibo profile page and store its fields on the Weibo user.

    The page at ``url`` (``self.url`` when ``url`` is falsy) is opened with
    ``self.opener``.  The user id is taken from the ``CONFIG['oid']='...';``
    assignment in the raw page and stored on ``self.uid``; the matching user
    is obtained with ``self.get_weibo_user(self.uid)``.

    ``<script>`` tags are then scanned for embedded JSON:

    * ``FM.view({...})`` payloads are dispatched on ``domid``
      (``Pl_Official_LeftInfo__*`` is the old layout,
      ``Pl_Official_PersonalInfo__*`` switches to the new layout; header and
      ``Pl_Official_RightGrowNew*`` pagelets supply avatar, counts,
      verified/vip flags, intro and level).
    * ``STK.pageletM.view({...})`` payloads are dispatched on ``pid``.

    The basic-profile, work, education and tag sections are written to
    ``weibo_user.info``, the user is saved and the
    ``processed_profile_page`` counter is incremented by one.

    :param url: page to load; defaults to ``self.url``.
    :returns: ``None``.  Returns early when ``self.check`` rejects the
        response.
    :raises FetchBannedError: when opening or reading the page, running
        ``self.check``, or extracting ``CONFIG['oid']`` raises.
    """
    url = url or self.url
    try:
        br = self.opener.browse_open(url)
        html = br.response().read()
        if not self.check(url, br):
            return
        self.uid = re.findall(r"CONFIG\['oid'\]='(.*)';", html)[0]
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are not reported as a ban.
        raise FetchBannedError("get banned on blog page")

    weibo_user = self.get_weibo_user(self.uid)
    if weibo_user.info is None:
        weibo_user.info = UserInfo()

    soup = beautiful_soup(html)

    def flatten(s):
        """Remove carriage returns, newlines and tabs from ``s``."""
        return s.replace('\r', '').replace('\n', '').replace('\t', '')

    new_style = False
    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            # Strip only the trailing ";" of the statement.  Removing every
            # semicolon would also eat those inside the embedded HTML
            # (entities, inline styles, user text).
            text = text.strip().rstrip(';').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid.startswith('Pl_Official_LeftInfo__'):
                info_soup = beautiful_soup(data['html'])
                info_div = info_soup.find('div',
                                          attrs={'class': 'profile_pinfo'})
                for block_div in info_div.find_all(
                        'div', attrs={'class': 'infoblock'}):
                    block_title = block_div.find('form').text.strip()
                    if block_title == u'基本信息':
                        profile_div = block_div
                    elif block_title == u'工作信息':
                        career_div = block_div
                    elif block_title == u'教育信息':
                        edu_div = block_div
                    elif block_title == u'标签信息':
                        tags_div = block_div
            elif domid.startswith('Pl_Official_PersonalInfo__'):
                new_style = True
                info_soup = beautiful_soup(data['html'])
                for block_div in info_soup.find_all(
                        'div', attrs={'class': 'WB_cardwrap'}):
                    # The card title is either <h4 class="obj_name"> or an
                    # <h2> inside <div class="obj_name">; cards with neither
                    # are skipped.
                    block_title_div = block_div.find(
                        'h4', attrs={'class': 'obj_name'})
                    if block_title_div is None:
                        title_wrap = block_div.find(
                            'div', attrs={'class': 'obj_name'})
                        if title_wrap is not None:
                            block_title_div = title_wrap.find('h2')
                    if block_title_div is None:
                        continue
                    block_title = block_title_div.text.strip()
                    inner_div = block_div.find(
                        'div', attrs={'class': 'WB_innerwrap'})
                    if block_title == u'基本信息':
                        profile_div = inner_div
                    elif block_title == u'工作信息':
                        career_div = inner_div
                    elif block_title == u'教育信息':
                        edu_div = inner_div
                    elif block_title == u'标签信息':
                        tags_div = inner_div
            elif domid == 'Pl_Official_Header__1' and 'html' in data:
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find(
                    'div', attrs={'class': 'pf_head_pic'}).find('img')['src']
                weibo_user.info.n_follows = int(
                    header_soup.find('ul', attrs={'class': 'user_atten'})
                    .find('strong', attrs={'node-type': 'follow'}).text)
                weibo_user.info.n_fans = int(
                    header_soup.find('ul', attrs={'class': 'user_atten'})
                    .find('strong', attrs={'node-type': 'fans'}).text)
            elif domid.startswith('Pl_Core_T8CustomTriColumn__') \
                    and 'html' in data:
                # new style friends info
                header_soup = beautiful_soup(data['html'])
                tds = header_soup.find(
                    'table', attrs={'class': 'tb_counter'}).find_all('td')
                weibo_user.info.n_follows = int(tds[0].find('strong').text)
                weibo_user.info.n_fans = int(tds[1].find('strong').text)
            elif domid.startswith('Pl_Official_Headerv6__'):
                # new style avatar info
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find(
                    'p', attrs='photo_wrap').find('img')['src']
                # The flags record whether the corresponding link is present
                # in the header.
                bs_verified = header_soup.find(
                    'a',
                    attrs={"suda-data": "key=pc_apply_entry&value=feed_icon"})
                weibo_user.info.verified = bool(bs_verified)
                bs_vip = header_soup.find(
                    'a',
                    attrs={"suda-uatrack": "key=home_vip&value=home_feed_vip"})
                weibo_user.info.vip = bool(bs_vip)
                weibo_user.info.pf_intro = header_soup.find(
                    'div', attrs={'class': 'pf_intro'}).text
            elif domid.startswith('Pl_Official_RightGrowNew'):
                header_soup = beautiful_soup(data['html'])
                # Second S_txt1 span: score.  First span: text whose part
                # after the "." is the level number.
                weibo_user.info.level_score = int(
                    header_soup.find('p', attrs={'class': 'level_info'})
                    .find_all('span', attrs={'class': 'S_txt1'})[1]
                    .text.strip())
                weibo_user.info.level = int(
                    header_soup.find('p', attrs={'class': 'level_info'})
                    .find_all('span', attrs={'class': 'S_txt1'})[0]
                    .text.strip().split('.')[1])
        elif 'STK' in text:
            text = text.replace(
                'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid == 'pl_profile_infoBase':
                profile_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoCareer':
                career_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoEdu':
                edu_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoTag':
                tags_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_photo':
                soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = soup.find('img')['src']

    # Page label -> attribute on ``weibo_user.info``; an optional ``func``
    # converts the scraped text before it is stored.
    profile_map = {
        u'昵称': {'field': 'nickname'},
        u'所在地': {'field': 'location'},
        u'性别': {'field': 'gender'},
        u'生日': {
            'field': 'birth',
            # Only full "<Y>年<M>月<D>日" dates are parsed; anything else
            # is stored as None.
            'func': lambda v: datetime.strptime(
                v.replace(u'年', '/').replace(u'月', '/').replace(u'日', ''),
                '%Y/%m/%d')
            if re.match(u'\\d+年\\d+月\\d+日', v) else None
        },
        u'博客': {'field': 'blog'},
        u'个性域名': {'field': 'site'},
        u'简介': {'field': 'intro'},
        u'邮箱': {'field': 'email'},
        u'QQ': {'field': 'qq'},
        u'MSN': {'field': 'msn'},
        u'注册时间': {'field': 'register_date'},
    }
    if profile_div is not None:
        if not new_style:
            divs = profile_div.find_all(attrs={'class': 'pf_item'})
        else:
            divs = profile_div.find_all('li', attrs={'class': 'li_1'})
        for div in divs:
            if not new_style:
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
            else:
                k = div.find('span', attrs={'class': 'pt_title'}) \
                    .text.strip().strip(u':')
                d = div.find('span', attrs={'class': 'pt_detail'})
                if d:
                    v = d.text.strip()
                else:
                    # No detail span: fall back to the link text.
                    v = div.find('a').text.strip()
            if k in profile_map:
                if k == u'个性域名' and '|' in v:
                    # Keep only the part after the "|" separator.
                    v = v.split('|')[1].strip()
                convert = profile_map[k].get('func')
                if convert is not None:
                    # Best effort: a value the converter cannot handle is
                    # stored as None instead of aborting the whole parse.
                    try:
                        v = convert(v)
                    except Exception:
                        v = None
                setattr(weibo_user.info, profile_map[k]['field'], v)

    weibo_user.info.work = []
    if career_div is not None:
        if not new_style:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                for p in div.find_all('p'):
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        work_info.name = a.text
                        if '(' in text:
                            work_info.date = text.strip().split(
                                '(')[1].strip(')')
                    elif text.startswith(u'地区:'):
                        work_info.location = text.split(u':', 1)[1]
                    elif text.startswith(u'职位:'):
                        work_info.position = text.split(u':', 1)[1]
                    else:
                        work_info.detail = text
                weibo_user.info.work.append(work_info)
        else:
            li = career_div.find('li', attrs={'class': 'li_1'})
            for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                work_info = WorkInfo()
                text = span.text
                a = span.find('a')
                if a is not None:
                    work_info.name = a.text
                if '(' in text:
                    work_info.date = flatten(
                        text.strip().split('(')[1]).split(')', 1)[0]
                for line in text.split('\r\n'):
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    if line.startswith(u'地区:'):
                        work_info.location = line.split(u':', 1)[1]
                    elif line.startswith(u'职位:'):
                        work_info.position = line.split(u':', 1)[1]
                    else:
                        # Any other non-empty line stores the whole span
                        # text, flattened, as the detail.
                        work_info.detail = flatten(text).strip()
                weibo_user.info.work.append(work_info)

    weibo_user.info.edu = []
    if edu_div is not None:
        if not new_style:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                for p in div.find_all('p'):
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split(
                                '(')[1].strip().strip(')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)
        else:
            span = edu_div.find('li', attrs={'class': 'li_1'}) \
                .find('span', attrs={'class': 'pt_detail'})
            text = span.text
            names = [a.text for a in span.find_all('a')]
            # All schools share one span: the text between one school name
            # and the next belongs to the former.
            for idx, name in enumerate(names):
                start_pos = text.find(name) + len(name)
                if idx < len(names) - 1:
                    end_pos = text.find(names[idx + 1], start_pos)
                else:
                    end_pos = len(text)
                t = text[start_pos:end_pos]
                edu_info = EduInfo()
                edu_info.name = name
                # Test this school's own segment: checking the whole
                # remainder raised IndexError when only a later school had
                # a parenthesised date.
                if '(' in t:
                    edu_info.date = flatten(
                        t.strip().split('(')[1]).split(')', 1)[0]
                    t = t[t.find(')') + 1:]
                text = text[end_pos:]
                edu_info.detail = flatten(t).strip()
                weibo_user.info.edu.append(edu_info)

    weibo_user.info.tags = []
    if tags_div is not None:
        if not new_style:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)
        else:
            for a in tags_div.find(
                    'span', attrs={'class': 'pt_detail'}).find_all('a'):
                weibo_user.info.tags.append(a.text.strip())

    weibo_user.save()

    # counter add one for the profile url
    self.counter.inc('processed_profile_page', 1)
def parse(self, url=None):
    """Fetch a Weibo profile page and store its fields on the Weibo user.

    The page at ``url`` (``self.url`` when ``url`` is falsy) is opened with
    ``self.opener`` and its ``<script>`` tags are scanned for embedded JSON:

    * ``FM.view({...})`` payloads are decoded only when the script text
      mentions one of a fixed list of ``domid`` values, then dispatched on
      ``domid``.
    * ``STK.pageletM.view({...})`` payloads are dispatched on ``pid``.

    From these the method collects the basic profile, rank and credit
    blocks, the follow/fans/weibo counters, work, education and tags, and
    the avatar and nickname.  It writes them to ``weibo_user.info`` and
    saves the user.

    :param url: page to load; defaults to ``self.url``.
    :returns: ``([], [])`` on every path, including the early exits taken
        when ``self.bundle.exists`` is ``False`` or ``self.check`` rejects
        the response.
    """
    if self.bundle.exists == False:
        return [], []

    url = url or self.url
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())

    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    if weibo_user.info is None:
        weibo_user.info = UserInfo()

    def store(mapping, key, value):
        """Set the attribute mapped to ``key``, applying its optional func.

        Keys that are not in ``mapping`` are ignored.
        """
        if key not in mapping:
            return
        entry = mapping[key]
        if 'func' in entry:
            value = entry['func'](value)
        setattr(weibo_user.info, entry['field'], value)

    def grow_blocks(html):
        """Return ``(rank_block, credit_block)`` found in a "grow" pagelet.

        Either element is ``None`` when its titled block is absent.
        """
        rank, credit = None, None
        right_div = beautiful_soup(html).find(
            'div', attrs={'class': 'prm_app_pinfo'})
        for block_div in right_div.find_all(
                'div', attrs={'class': 'info_block'}):
            block_title = block_div.find('form').text.strip()
            if block_title == u'等级信息':
                rank = block_div
            elif block_title == u'信用信息':
                credit = block_div
        return rank, credit

    # FM.view payloads are decoded only when the script mentions one of
    # these ids.
    fm_markers = (
        "Pl_Official_LeftInfo__17",
        "Pl_Official_Header__1",
        "Pl_Official_RightGrow__17",
        "Pl_Official_LeftInfo__36",
        "Pl_Official_LeftInfo__41",
        "Pl_Core_Header__1",
    )
    left_info_domids = (
        'Pl_Official_LeftInfo__17',
        'Pl_Official_LeftInfo__36',
        'Pl_Official_LeftInfo__41',
    )

    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    weibo_ul = None
    rank_div = None
    credit_div = None
    head_pic_div = None
    user_atten_div = None
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view') and \
                any(marker in text for marker in fm_markers):
            # Remove the statement terminator *before* dropping the closing
            # ")".  Checking for ";" afterwards left "{...})" behind, which
            # json.loads rejects.
            text = text.strip()
            if text.endswith(';'):
                text = text[:-1]
            text = text.replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid in left_info_domids:
                info_soup = beautiful_soup(data['html'])
                info_div = info_soup.find('div',
                                          attrs={'class': 'profile_pinfo'})
                for block_div in info_div.find_all(
                        'div', attrs={'class': 'infoblock'}):
                    block_title = block_div.find('form').text.strip()
                    if block_title == u'基本信息':
                        profile_div = block_div
                    elif block_title == u'工作信息':
                        career_div = block_div
                    elif block_title == u'教育信息':
                        edu_div = block_div
                    elif block_title == u'标签信息':
                        tags_div = block_div
            elif domid == 'Pl_Official_RightGrow__17':
                found_rank, found_credit = grow_blocks(data['html'])
                # Keep an earlier hit when this pagelet lacks the block.
                if found_rank is not None:
                    rank_div = found_rank
                if found_credit is not None:
                    credit_div = found_credit
            elif domid == 'Pl_Official_Header__1':
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find(
                    'div', attrs={'class': 'pf_head_pic'}).find('img')['src']
                weibo_ul = header_soup.find(
                    'ul', attrs={'class': 'user_atten clearfix user_atten_s'})
            elif domid == 'Pl_Core_Header__1':
                core_header_soup = beautiful_soup(data['html'])
                head_div = core_header_soup.find(
                    'div', attrs={'class': 'pf_head S_bg5 S_line1'})
                head_pic_div = head_div.find(
                    'div', attrs={'class': 'pf_head_pic'})
                user_atten_div = head_div.find(
                    'div', attrs={'class': 'user_atten'})
        elif 'STK' in text:
            text = text.replace(
                'STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid == 'pl_profile_infoBase':
                profile_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoCareer':
                career_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoEdu':
                edu_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoTag':
                tags_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoGrow':
                found_rank, found_credit = grow_blocks(data['html'])
                if found_rank is not None:
                    rank_div = found_rank
                if found_credit is not None:
                    credit_div = found_credit
            elif pid == 'pl_profile_photo':
                soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = soup.find('img')['src']
                weibo_ul = soup.find(
                    'ul', attrs={'class': 'user_atten clearfix user_atten_m'})
            elif pid == 'pl_leftNav_profilePersonal':
                # Used only as a fallback when no avatar has been found yet.
                if weibo_user.info.avatar is None:
                    soup = beautiful_soup(data['html'])
                    face = soup.find('div', attrs={'class': 'face_infor'})
                    weibo_user.info.avatar = face.find('img')['src']
                    weibo_user.info.nickname = soup.find(
                        'div', attrs={'class': 'face_infor'}).find(
                        'a', attrs={'class': 'logo_img'})['title']
            elif pid == 'pl_content_litePersonInfo':
                soup = beautiful_soup(data['html'])
                weibo_ul = soup.find(
                    'ul', attrs={'class': 'user_atten clearfix'})

    # Page label -> attribute on ``weibo_user.info``.
    profile_map = {
        u'昵称': {'field': 'nickname'},
        u'真实姓名': {'field': 'realname'},
        u'所在地': {'field': 'location'},
        u'性别': {'field': 'sex'},
        u'性取向': {'field': 'sex_dir'},
        u'生日': {'field': 'birth'},
        u'感情状况': {'field': 'love'},
        u'血型': {'field': 'blood_type'},
        u'博客': {'field': 'blog'},
        u'个性域名': {'field': 'site'},
        u'简介': {'field': 'intro'},
        u'邮箱': {'field': 'email'},
        u'QQ': {'field': 'qq'},
        u'MSN': {'field': 'msn'},
    }
    if profile_div is not None:
        for div in profile_div.find_all(attrs={'class': 'pf_item'}):
            k = div.find(attrs={'class': 'label'}).text.strip()
            v = div.find(attrs={'class': 'con'}).text.strip()
            if k == u'个性域名' and '|' in v:
                # Keep only the part after the "|" separator.
                v = v.split('|')[1].strip()
            store(profile_map, k, v)

    rank_map = {
        u'当前等级': {'field': 'rank'},
        u'活跃天数': {'field': 'active_day'},
    }
    if rank_div is not None:
        for div in rank_div.find_all(attrs={'class': 'info'}):
            # The label is the first four characters of the row text.
            k = div.text.strip()[:4]
            v = div.find(attrs={'class': 'S_txt1 point'}).text.strip('LV')
            store(rank_map, k, v)

    credit_map = {
        u'信用等级': {'field': 'credit_rank'},
        u'当前信用积分': {'field': 'credit'},
    }
    if credit_div is not None:
        for div in credit_div.find_all(attrs={'class': 'info'}):
            row_text = div.text.strip()
            # The two rows have labels of different length and keep their
            # value in differently classed elements.
            if u'信用等级' in row_text:
                k = row_text[:4]
                v = div.find(attrs={'class': 'S_txt1'}).text.strip()
            else:
                k = row_text[:6]
                v = div.find(attrs={'class': 'S_txt1 point'}).text.strip()
            store(credit_map, k, v)

    weibo_map = {
        u'关注': {'field': 'follow_num'},
        u'粉丝': {'field': 'fans_num'},
        u'微博': {'field': 'weibo_num'},
    }
    if weibo_ul is not None:
        for li in weibo_ul.find_all('li'):
            k = li.find('span').text.strip()
            v = li.find('strong').text.strip()
            store(weibo_map, k, v)

    weibo_user.info.work = []
    if career_div is not None:
        for div in career_div.find_all(attrs={'class': 'con'}):
            work_info = WorkInfo()
            for p in div.find_all('p'):
                a = p.find('a')
                text = p.text
                if a is not None:
                    work_info.name = a.text
                    if '(' in text:
                        work_info.date = \
                            text.strip().split('(')[1].strip(')')
                elif text.startswith(u'地区:'):
                    work_info.location = text.split(u':', 1)[1]
                elif text.startswith(u'职位:'):
                    work_info.position = text.split(u':', 1)[1]
                else:
                    work_info.detail = text
            weibo_user.info.work.append(work_info)

    weibo_user.info.edu = []
    if edu_div is not None:
        for div in edu_div.find_all(attrs={'class': 'con'}):
            edu_info = EduInfo()
            for p in div.find_all('p'):
                a = p.find('a')
                text = p.text
                if a is not None:
                    edu_info.name = a.text
                    if '(' in text:
                        edu_info.date = \
                            text.strip().split('(')[1].strip(')')
                else:
                    edu_info.detail = text
            weibo_user.info.edu.append(edu_info)

    weibo_user.info.tags = []
    if tags_div is not None:
        for div in tags_div.find_all(attrs={'class': 'con'}):
            for a in div.find_all('a'):
                weibo_user.info.tags.append(a.text)

    # Fallbacks taken from the Pl_Core_Header__1 pagelet when the primary
    # sources did not provide an avatar or a counter list.
    if head_pic_div is not None and weibo_user.info.avatar is None:
        weibo_user.info.avatar = head_pic_div.find('img')['src']
        weibo_user.info.nickname = head_pic_div.find('img')['title']

    if weibo_ul is None and user_atten_div is not None:
        for td in user_atten_div.find_all('td'):
            k = td.find('span').text.strip()
            v = td.find('strong').text.strip()
            store(weibo_map, k, v)

    weibo_user.save()
    self.logger.debug('parse %s finish' % url)
    return [], []