def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []
    url = url or self.url
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    info = weibo_user.info
    if info is None:
        weibo_user.info = UserInfo()

    profile_div = None
    relation_div = None
    career_div = None
    edu_div = None
    tags_div = None
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            if 'domid' not in data or 'html' not in data:
                # self.logger.warn('domid or html is missing, url:%s' % url)
                continue
            domid = data['domid']
            if domid.startswith('Pl_Official_PersonalInfo__'):
                info_soup = beautiful_soup(data['html'])
def parse(self, url=None):
    if self.bundle.exists is False:
        return
    url = url or self.url
    br = self.opener.browse_open(url)
    # self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())
    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()

    html = None
    decodes = urldecode(url)
    is_follow = True
    is_new_mode = False
    is_banned = True
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            if is_banned:
                is_banned = False
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid.startswith('Pl_Official_LeftHisRelation__') or \
                    domid.startswith('Pl_Official_HisRelation__'):
                html = beautiful_soup(data['html'])
                if 'relate' in decodes and decodes['relate'] == 'fans':
                    is_follow = False
                is_new_mode = True
        elif 'STK' in text:
            if is_banned:
                is_banned = False
            text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            if data['pid'] == 'pl_relation_hisFollow' or \
                    data['pid'] == 'pl_relation_hisFans':
                html = beautiful_soup(data['html'])
                if data['pid'] == 'pl_relation_hisFans':
                    is_follow = False

    if is_banned:
        raise FetchBannedError('fetch banned by weibo server')

    ul = None
    try:
        ul = html.find(attrs={'class': 'cnfList', 'node-type': 'userListBox'})
        if ul is None:
            ul = html.find(attrs={'class': 'follow_list', 'node-type': 'userListBox'})
    except AttributeError, e:
        if br.geturl().startswith('http://e.weibo.com'):
            return
        raise e
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []
    url = url or self.url
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    info = weibo_user.info
    if info is None:
        weibo_user.info = UserInfo()

    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    texts = [script.text.strip().replace(';', '').replace('FM.view(', '')[:-1]
             for script in soup.find_all('script')
             if script.text.startswith('FM.view')]
    for text in texts:
        try:
            data = json.loads(text)
        except ValueError, e:
            return [], []
        domid = data['domid']
        if domid.startswith('Pl_Official_PersonalInfo__'):
            info_soup = beautiful_soup(data['html'])
            for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap S_bg2'}):
                block_title = block_div.find('span', attrs={'class': 'main_title'}).text.strip()
                if block_title == u'基本信息':
                    profile_div = block_div
                elif block_title == u'工作信息':
                    career_div = block_div
                elif block_title == u'教育信息':
                    edu_div = block_div
                elif block_title == u'标签信息':
                    tags_div = block_div
        elif domid.startswith('Pl_Official_Header'):
            header_soup = beautiful_soup(data['html'])
            weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_photo'})\
                .find('img')['src']
        elif domid.startswith('Pl_Core_T8CustomTriColumn'):
            follow_soup = beautiful_soup(data['html'])
            follows = follow_soup.find_all('td', attrs={'class': 'S_line1'})
            weibo_user.info.n_follows = int(follows[0].find('strong').text)
            weibo_user.info.n_fans = int(follows[1].find('strong').text)
def _get_article(self, candidates, best_candidate):
    # Now that we have the top candidate, look through its siblings for
    # content that might also be related. Things like preambles, content
    # split by ads that we removed, etc.
    sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
    output = beautiful_soup("<div/>")
    for sibling in best_candidate['elem'].parent.contents:
        if isinstance(sibling, NavigableString):
            continue

        append = False
        if sibling is best_candidate['elem']:
            append = True
        sibling_key = HashableElement(sibling)
        if sibling_key in candidates and \
                candidates[sibling_key]['content_score'] >= sibling_score_threshold:
            append = True

        if sibling.name == "p":
            link_density = self._get_link_density(sibling)
            node_content = sibling.string or ""
            node_length = len(node_content)

            if node_length > 80 and link_density < 0.25:
                append = True
            elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
                append = True

        if append:
            output.div.append(sibling)

    return output
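# A standalone illustration (made-up score) of the sibling threshold used in
# _get_article above: a sibling is kept once its score reaches the larger of
# 10 and 20% of the best candidate's content score.
#
#     >>> max([10, 120.0 * 0.2])
#     24.0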
def parse(self, url=None):
    if self.bundle.exists is False:
        return
    url = url or self.url
    br = self.opener.browse_open(url)
    # self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())
    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()

    html = None
    decodes = urldecode(url)
    is_follow = True
    is_new_mode = False
    for script in soup.find_all("script"):
        text = script.text
        if text.startswith("FM.view"):
            text = text.strip().replace(";", "").replace("FM.view(", "")[:-1]
            data = json.loads(text)
            domid = data["domid"]
            if domid.startswith("Pl_Official_LeftHisRelation__") or \
                    domid.startswith("Pl_Official_HisRelation__"):
                html = beautiful_soup(data["html"])
                if "relate" in decodes and decodes["relate"] == "fans":
                    is_follow = False
                is_new_mode = True
        elif "STK" in text:
            text = text.replace("STK && STK.pageletM && STK.pageletM.view(", "")[:-1]
            data = json.loads(text)
            if data["pid"] == "pl_relation_hisFollow" or data["pid"] == "pl_relation_hisFans":
                html = beautiful_soup(data["html"])
                if data["pid"] == "pl_relation_hisFans":
                    is_follow = False

    ul = None
    try:
        ul = html.find(attrs={"class": "cnfList", "node-type": "userListBox"})
        if ul is None:
            ul = html.find(attrs={"class": "follow_list", "node-type": "userListBox"})
    except AttributeError, e:
        if br.geturl().startswith("http://e.weibo.com"):
            return
        raise e
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []
    url = url or self.url

    br, soup = None, None
    try:
        br = self.opener.browse_open(url)
        soup = beautiful_soup(br.response().read())
    except Exception, e:
        return self._error(url, e)
def parse(self, url=None):
    if self.bundle.exists == False or self.bundle.level >= MAX_LEVEL:
        return [], []
    url = url or self.url

    br, soup = None, None
    try:
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        soup = beautiful_soup(br.response().read())
    except Exception, e:
        return self._error(url, e)
def process(self, base_url=None):
    self.html = self._remove_crufy_html(self.html)
    self.soup = beautiful_soup(self.html, self.logger)

    base_url = self.base_url or base_url
    if base_url is not None:
        self._fix_references(base_url)

    title = self.get_title(self.soup)
    body = self.get_body(self.soup)

    return title, body
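# Hypothetical usage sketch for process(); the class name, constructor and
# attribute names below are assumptions for illustration, only process()
# itself appears above.
#
#     extractor = Extractor(html=raw_html, base_url='http://example.com/post')
#     title, body = extractor.process()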
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []
    url = url or self.url
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    weibo_user.qids = []
    lis = soup.find_all('li', attrs={'action-type': 'click_link'})
    for li in lis:
        if li.has_attr('action-data'):
            li_data = li['action-data']
            if '/' in li_data:
                weibo_user.qids.append(li_data.rsplit('/', 1)[1])

    weibo_user.save()

    return [], []
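# Standalone illustration of the qid extraction above (the action-data value
# is a made-up example): the qid is simply whatever follows the last '/'.
#
#     >>> 'http://q.weibo.com/sample_qid'.rsplit('/', 1)[1]
#     'sample_qid'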
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []
    url = url or self.url

    br = None
    jsn = None
    try:
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        jsn = json.loads(br.response().read())
    except (ValueError, URLError) as e:
        return self._error(url, e)

    soup = beautiful_soup(jsn['data']['html'])
    current_page = jsn['data']['page']['pagenum']
    n_pages = jsn['data']['page']['totalpage']

    if not self.check(url, br):
        return [], []

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))

    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    if url.startswith('http://weibo.com/aj/comment'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            uid = dl.find('a', usercard=True)['usercard'].split("id=", 1)[1]
            comment = Comment(uid=uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward_again_a = dl.find(
                'a', attrs={'action-type': re.compile("^(feed_list|fl)_forward$")})
            uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
            forward = Forward(uid=uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)

    try:
        mblog.save()
        self.logger.debug('parse %s finish' % url)
    except ValidationError, e:
        return self._error(url, e)
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []
    url = url or self.url

    br = None
    jsn = None
    try:
        br = self.opener.browse_open(url)
        self.logger.debug('load %s finish' % url)
        jsn = json.loads(br.response().read())
    except (ValueError, URLError) as e:
        return self._error(url, e)

    soup = beautiful_soup(jsn['data']['html'])
    current_page = jsn['data']['page']['pagenum']
    n_pages = jsn['data']['page']['totalpage']

    if not self.check(url, br):
        return [], []

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))

    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date_source = dl.find('dd').find('span', attrs={'class': 'S_txt2'})
        if date_source is not None:
            date = date_source.text
        else:
            date_source = dl.find('dd').find('span', attrs={'class': 'fl'})\
                .find('em', attrs={'class': 'S_txt2'})
            date = date_source.text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    if url.startswith('http://weibo.com/aj/comment'):
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            if fetch_comment_limit > 0 and \
                    self.bundle.fetched_weibo_comment_num >= fetch_comment_limit:
                self.bundle.fetched_weibo_comment_num = 0
                try:
                    mblog.save()
                    self.logger.debug('parse %s finish' % url)
                except ValidationError, e:
                    return self._error(url, e)
                return [], []
            link = dl.find('a', attrs={'action-type': 'replycomment'})
            data = dict([l.split('=') for l in link['action-data'].split('&')])
            if fetch_comment_limit > 0 and self.bundle.fetched_last_comment_id != data['mid']:
                self.bundle.fetched_weibo_comment_num = 0
            comment = Comment(uid=data['ouid'], mid=data['mid'])
            set_instance(comment, dl)
            mblog.comments.append(comment)
            self.bundle.fetched_last_comment_id = data['mid']
            self.bundle.fetched_weibo_comment_num = self.bundle.fetched_weibo_comment_num + 1
def parse(self, url=None): if self.bundle.exists is False: return url = url or self.url br = self.opener.browse_open(url) # self.logger.debug('load %s finish' % url) soup = beautiful_soup(br.response().read()) if not self.check(url, br): return weibo_user = self.get_weibo_user() info = weibo_user.info if info is None: weibo_user.info = UserInfo() new_style = False profile_div = None career_div = None edu_div = None tags_div = None for script in soup.find_all('script'): text = script.text if text.startswith('FM.view'): text = text.strip().replace(';', '').replace('FM.view(', '')[:-1] data = json.loads(text) domid = data['domid'] if domid.startswith('Pl_Official_LeftInfo__'): info_soup = beautiful_soup(data['html']) info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'}) for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}): block_title = block_div.find('form').text.strip() if block_title == u'基本信息': profile_div = block_div elif block_title == u'工作信息': career_div = block_div elif block_title == u'教育信息': edu_div = block_div elif block_title == u'标签信息': tags_div = block_div elif domid.startswith('Pl_Official_PersonalInfo__'): new_style = True info_soup = beautiful_soup(data['html']) for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap'}): block_title_div = block_div.find('h4', attrs={'class': 'obj_name'}) if block_title_div is None: block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\ .find('h2') if block_title_div is None: continue block_title = block_title_div.text.strip() inner_div = block_div.find('div', attrs={'class': 'WB_innerwrap'}) if block_title == u'基本信息': profile_div = inner_div elif block_title == u'工作信息': career_div = inner_div elif block_title == u'教育信息': edu_div = inner_div elif block_title == u'标签信息': tags_div = inner_div elif domid == 'Pl_Official_Header__1': header_soup = beautiful_soup(data['html']) weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\ .find('img')['src'] weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\ .find('strong', attrs={'node-type': 'follow'}).text) weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\ .find('strong', attrs={'node-type': 'fans'}).text) elif domid.startswith('Pl_Core_T8CustomTriColumn__'): # new style friends info header_soup = beautiful_soup(data['html']) tds = header_soup.find('table', attrs={'class': 'tb_counter'})\ .find_all('td') weibo_user.info.n_follows = int(tds[0].find('strong').text) weibo_user.info.n_fans = int(tds[1].find('strong').text) elif domid.startswith('Pl_Official_Headerv6__'): # new style avatar info header_soup = beautiful_soup(data['html']) weibo_user.info.avatar = header_soup.find('p', attrs='photo_wrap')\ .find('img')['src'] elif 'STK' in text: text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1] data = json.loads(text) pid = data['pid'] if pid == 'pl_profile_infoBase': profile_div = beautiful_soup(data['html']) elif pid == 'pl_profile_infoCareer': career_div = beautiful_soup(data['html']) elif pid == 'pl_profile_infoEdu': edu_div = beautiful_soup(data['html']) elif pid == 'pl_profile_infoTag': tags_div = beautiful_soup(data['html']) elif pid == 'pl_profile_photo': soup = beautiful_soup(data['html']) weibo_user.info.avatar = soup.find('img')['src'] profile_map = { u'昵称': {'field': 'nickname'}, u'所在地': {'field': 'location'}, u'性别': {'field': 'sex', 'func': lambda s: True if s == u'男' else False}, u'生日': {'field': 'birth'}, u'博客': 
{'field': 'blog'}, u'个性域名': {'field': 'site'}, u'简介': {'field': 'intro'}, u'邮箱': {'field': 'email'}, u'QQ': {'field': 'qq'}, u'MSN': {'field': 'msn'} } if profile_div is not None: if not new_style: divs = profile_div.find_all(attrs={'class': 'pf_item'}) else: divs = profile_div.find_all('li', attrs={'class': 'li_1'}) for div in divs: if not new_style: k = div.find(attrs={'class': 'label'}).text.strip() v = div.find(attrs={'class': 'con'}).text.strip() else: k = div.find('span', attrs={'class': 'pt_title'}).text.strip().strip(u':') d = div.find('span', attrs={'class': 'pt_detail'}) if d: v = d.text.strip() else: v = div.find('a').text.strip() if k in profile_map: if k == u'个性域名' and '|' in v: v = v.split('|')[1].strip() func = (lambda s: s) \ if 'func' not in profile_map[k] \ else profile_map[k]['func'] v = func(v) setattr(weibo_user.info, profile_map[k]['field'], v) weibo_user.info.work = [] if career_div is not None: if not new_style: for div in career_div.find_all(attrs={'class': 'con'}): work_info = WorkInfo() ps = div.find_all('p') for p in ps: a = p.find('a') if a is not None: work_info.name = a.text text = p.text if '(' in text: work_info.date = text.strip().split('(')[1].strip(')') else: text = p.text if text.startswith(u'地区:'): work_info.location = text.split(u':', 1)[1] elif text.startswith(u'职位:'): work_info.position = text.split(u':', 1)[1] else: work_info.detail = text weibo_user.info.work.append(work_info) else: li = career_div.find('li', attrs={'class': 'li_1'}) for span in li.find_all('span', attrs={'class': 'pt_detail'}): work_info = WorkInfo() text = span.text a = span.find('a') if a is not None: work_info.name = a.text if '(' in text: work_info.date = text.strip().split('(')[1]\ .replace('\r', '')\ .replace('\n', '')\ .replace('\t', '')\ .split(')', 1)[0] for l in text.split('\r\n'): l = l.strip() if len(l) == 0: continue if l.startswith(u'地区:'): work_info.location = l.split(u':', 1)[1] elif l.startswith(u'职位:'): work_info.position = l.split(u':', 1)[1] else: work_info.detail = text.replace('\r', '')\ .replace('\n', '')\ .replace('\t', '')\ .strip() weibo_user.info.work.append(work_info) weibo_user.info.edu = [] if edu_div is not None: if not new_style: for div in edu_div.find_all(attrs={'class': 'con'}): edu_info = EduInfo() ps = div.find_all('p') for p in ps: a = p.find('a') text = p.text if a is not None: edu_info.name = a.text if '(' in text: edu_info.date = text.strip().split('(')[1].strip().strip(')') else: edu_info.detail = text weibo_user.info.edu.append(edu_info) else: span = edu_div.find('li', attrs={'class': 'li_1'})\ .find('span', attrs={'class': 'pt_detail'}) text = span.text names = [] for a in span.find_all('a'): names.append(a.text) for idx, name in enumerate(names): start_pos = text.find(name) + len(name) if idx < len(names) - 1: end_pos = text.find(names[idx+1], start_pos) else: end_pos = len(text) t = text[start_pos: end_pos] edu_info = EduInfo() edu_info.name = name if '(' in text: edu_info.date = t.strip().split('(')[1]\ .replace('\r', '')\ .replace('\n', '')\ .replace('\t', '')\ .split(')', 1)[0] t = t[t.find(')')+1:] text = text[end_pos:] edu_info.detail = t.replace('\r', '').replace('\n', '')\ .replace('\t', '').strip() weibo_user.info.edu.append(edu_info) weibo_user.info.tags = [] if tags_div is not None: if not new_style: for div in tags_div.find_all(attrs={'class': 'con'}): for a in div.find_all('a'): weibo_user.info.tags.append(a.text) else: for a in tags_div.find('span', attrs={'class': 'pt_detail'}).find_all('a'): 
weibo_user.info.tags.append(a.text.strip()) weibo_user.save() # self.logger.debug('parse %s finish' % url) # counter add one for the profile url self.counter.inc('processed_profile_page', 1)
def parse(self, url=None):
    if self.bundle.exists is False:
        return
    url = url or self.url
    br = self.opener.browse_open(url)
    try:
        jsn = json.loads(br.response().read())
    except ValueError:
        raise FetchBannedError('fetch banned by weibo server')
    # self.logger.debug('load %s finish' % url)

    try:
        soup = beautiful_soup(jsn['data']['html'])
        current_page = jsn['data']['page']['pagenum']
        n_pages = jsn['data']['page']['totalpage']
    except KeyError:
        raise FetchBannedError('fetch banned by weibo server')

    if not self.check(url, br):
        return

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))

    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    counter_type = None
    if url.startswith('http://weibo.com/aj/comment'):
        counter_type = 'comment'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            uid = dl.find('a', usercard=True)['usercard'].split("id=", 1)[1]
            comment = Comment(uid=uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        counter_type = 'forward'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward_again_a = dl.find(
                'a', attrs={'action-type': re.compile("^(feed_list|fl)_forward$")})
            uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
            forward = Forward(uid=uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        counter_type = 'like'
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)

    mblog.save()
    # self.logger.debug('parse %s finish' % url)
    # counter add one for the processed forward or comment or like list url
    if counter_type is not None:
        self.counter.inc('processed_%s_list_page' % counter_type, 1)

    if current_page >= n_pages:
        return

    params = urldecode(url)
    new_params = urldecode('?page=%s' % (current_page + 1))
    params.update(new_params)
    params['__rnd'] = int(time.time() * 1000)
    next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
    yield next_page
def parse(self, url=None): if self.bundle.exists is False: return url = url or self.url br = self.opener.browse_open(url) jsn = json.loads(br.response().read()) # self.logger.debug('load %s finish' % url) soup = beautiful_soup(jsn["data"]["html"]) current_page = jsn["data"]["page"]["pagenum"] n_pages = jsn["data"]["page"]["totalpage"] if not self.check(url, br): return decodes = urldecode(url) mid = decodes.get("id", decodes.get("mid")) mblog = self.bundle.current_mblog if mblog is None or mblog.mid != mid: try: mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid)) except DoesNotExist: mblog = MicroBlog(mid=mid, uid=self.uid) mblog.save() def set_instance(instance, dl): instance.avatar = dl.find("dt").find("img")["src"] date = dl.find("dd").find(attrs={"class": "S_txt2"}).text date = date.strip().strip("(").strip(")") instance.created = self.parse_datetime(date) for div in dl.find_all("div"): div.extract() for span in dl.find_all("span"): span.extract() instance.content = dl.text.strip() counter_type = None if url.startswith("http://weibo.com/aj/comment"): counter_type = "comment" dls = soup.find_all("dl", mid=True) for dl in dls: uid = dl.find("a", usercard=True)["usercard"].split("id=", 1)[1] comment = Comment(uid=uid) set_instance(comment, dl) mblog.comments.append(comment) elif url.startswith("http://weibo.com/aj/mblog/info"): counter_type = "forward" dls = soup.find_all("dl", mid=True) for dl in dls: forward_again_a = dl.find("a", attrs={"action-type": re.compile("^(feed_list|fl)_forward$")}) uid = urldecode("?%s" % forward_again_a["action-data"])["uid"] forward = Forward(uid=uid, mid=dl["mid"]) set_instance(forward, dl) mblog.forwards.append(forward) elif url.startswith("http://weibo.com/aj/like"): counter_type = "like" lis = soup.find_all("li", uid=True) for li in lis: like = Like(uid=li["uid"]) like.avatar = li.find("img")["src"] mblog.likes.append(like) mblog.save() # self.logger.debug('parse %s finish' % url) # counter add one for the processed forward or comment or like list url if counter_type is not None: self.counter.inc("processed_%s_list_page" % counter_type, 1) if current_page >= n_pages: return params = urldecode(url) new_params = urldecode("?page=%s" % (current_page + 1)) params.update(new_params) params["__rnd"] = int(time.time() * 1000) next_page = "%s?%s" % (url.split("?")[0], urllib.urlencode(params)) yield next_page
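# Standalone sketch of how the next-page URL is assembled in the two parsers
# above (the endpoint and the mid are made-up examples; the real code reuses
# the current request's own query string via urldecode()).
#
#     import time
#     import urllib
#
#     params = {'id': '3456789012345678', '_t': 0, 'page': 2}
#     params['__rnd'] = int(time.time() * 1000)
#     next_page = 'http://weibo.com/aj/comment/big?%s' % urllib.urlencode(params)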
def parse(self, url=None): if self.bundle.exists == False: return [], [] url = url or self.url params = urldecode(url) br = self.opener.browse_open(url) self.logger.debug('load %s finish' % url) if not self.check(url, br): return [], [] weibo_user = self.get_weibo_user() params['_t'] = 0 params['__rnd'] = str(int(time.time() * 1000)) page = int(params.get('page', 1)) pre_page = int(params.get('pre_page', 0)) count = 15 if 'pagebar' not in params: params['pagebar'] = '0' pre_page += 1 elif params['pagebar'] == '0': params['pagebar'] = '1' elif params['pagebar'] == '1': del params['pagebar'] pre_page = page page += 1 count = 50 params['count'] = count params['page'] = page params['pre_page'] = pre_page data = json.loads(br.response().read())['data'] soup = beautiful_soup(data) finished = False divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True) max_id = None next_urls = [] for div in divs: mid = div['mid'] if len(mid) == 0: continue max_id = mid if 'end_id' not in params: params['end_id'] = mid if mid in weibo_user.newest_mids: finished = True break if len(self.bundle.newest_mids) < 3: self.bundle.newest_mids.append(mid) try: mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid)) except DoesNotExist: mblog = MicroBlog(mid=mid, uid=self.uid) content_div = div.find('div', attrs={ 'class': 'WB_text', 'node-type': 'feed_list_content' }) for img in content_div.find_all("img", attrs={'type': 'face'}): img.replace_with(img['title']); mblog.content = content_div.text is_forward = div.get('isforward') == '1' if is_forward: name_a = div.find('a', attrs={ 'class': 'WB_name', 'node-type': 'feed_list_originNick' }) text_a = div.find('div', attrs={ 'class': 'WB_text', 'node-type': 'feed_list_reason' }) if name_a is not None and text_a is not None: mblog.forward = '%s: %s' % ( name_a.text, text_a.text ) mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title']) if self.bundle.last_update is None or mblog.created > self.bundle.last_update: self.bundle.last_update = mblog.created if weibo_user.last_update is not None and \ mblog.created <= weibo_user.last_update: finished = True break likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text likes = likes.strip('(').strip(')') likes = 0 if len(likes) == 0 else int(likes) mblog.n_likes = likes forwards = div.find('a', attrs={'action-type': 'feed_list_forward'}).text if '(' not in forwards: mblog.n_forwards = 0 else: mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')')) comments = div.find('a', attrs={'action-type': 'feed_list_comment'}).text if '(' not in comments: mblog.n_comments = 0 else: mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')')) # fetch geo info map_info = div.find("div", attrs={'class': 'map_data'}) if map_info is not None: geo = Geo() geo.location = map_info.text.split('-')[0].strip() geo_info = urldecode("?"+map_info.find('a')['action-data'])['geo'] geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(',', 1)]) mblog.geo = geo # fetch forwards and comments if fetch_forward or fetch_comment or fetch_like: query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)} query_str = urllib.urlencode(query) if fetch_forward and mblog.n_forwards > 0: forward_url = 'http://weibo.com/aj/comment/big?%s' % query_str next_urls.append(forward_url) if fetch_comment and mblog.n_comments > 0: comment_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str next_urls.append(comment_url) if fetch_like and mblog.n_likes > 0: query = {'mid': mid, '_t': 0, 
'__rnd': int(time.time()*1000)} query_str = urllib.urlencode(query) like_url = 'http://weibo.com/aj/like/big?%s' % query_str next_urls.append(like_url) mblog.save() if 'pagebar' in params: params['max_id'] = max_id else: del params['max_id'] self.logger.debug('parse %s finish' % url) # if not has next page if len(divs) == 0 or finished: weibo_user = self.get_weibo_user() for mid in self.bundle.newest_mids: if mid not in weibo_user.newest_mids: weibo_user.newest_mids.append(mid) while len(weibo_user.newest_mids) > 3: weibo_user.newest_mids.pop() weibo_user.last_update = self.bundle.last_update weibo_user.save() return [], [] next_urls.append('%s?%s'%(url.split('?')[0], urllib.urlencode(params))) return next_urls, []
def parse(self, url=None): if self.bundle.exists == False: return [], [] url = url or self.url br = None jsn = None try: br = self.opener.browse_open(url) self.logger.debug('load %s finish' % url) jsn = json.loads(br.response().read()) except (ValueError, URLError) as e: return self._error(url, e) soup = beautiful_soup(jsn['data']['html']) current_page = jsn['data']['page']['pagenum'] n_pages = jsn['data']['page']['totalpage'] if not self.check(url, br): return [], [] decodes = urldecode(url) mid = decodes.get('id', decodes.get('mid')) mblog = self.bundle.current_mblog if mblog is None or mblog.mid != mid: try: mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid)) except DoesNotExist: mblog = MicroBlog(mid=mid, uid=self.uid) mblog.save() def set_instance(instance, dl): instance.avatar = dl.find('dt').find('img')['src'] date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text date = date.strip().strip('(').strip(')') instance.created = self.parse_datetime(date) for div in dl.find_all('div'): div.extract() for span in dl.find_all('span'): span.extract() instance.content = dl.text.strip() if url.startswith('http://weibo.com/aj/comment'): dls = soup.find_all('dl', mid=True) for dl in dls: comment = Comment(uid=self.uid) set_instance(comment, dl) mblog.comments.append(comment) elif url.startswith('http://weibo.com/aj/mblog/info'): dls = soup.find_all('dl', mid=True) for dl in dls: forward = Forward(uid=self.uid, mid=dl['mid']) set_instance(forward, dl) mblog.forwards.append(forward) elif url.startswith('http://weibo.com/aj/like'): lis = soup.find_all('li', uid=True) for li in lis: like = Like(uid=li['uid']) like.avatar = li.find('img')['src'] mblog.likes.append(like) try: mblog.save() self.logger.debug('parse %s finish' % url) except ValidationError, e: return self._error(url, e)
def parse(self, url=None): if self.bundle.exists == False: return [], [] url = url or self.url br = None jsn = None try: br = self.opener.browse_open(url) self.logger.debug("load %s finish" % url) jsn = json.loads(br.response().read()) except (ValueError, URLError) as e: return self._error(url, e) soup = beautiful_soup(jsn["data"]["html"]) current_page = jsn["data"]["page"]["pagenum"] n_pages = jsn["data"]["page"]["totalpage"] if not self.check(url, br): return [], [] decodes = urldecode(url) mid = decodes.get("id", decodes.get("mid")) mblog = self.bundle.current_mblog if mblog is None or mblog.mid != mid: try: mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid)) except DoesNotExist: mblog = MicroBlog(mid=mid, uid=self.uid) mblog.save() def set_instance(instance, dl): instance.avatar = dl.find("dt").find("img")["src"] date = dl.find("dd").find("span", attrs={"class": "S_txt2"}).text date = date.strip().strip("(").strip(")") instance.created = self.parse_datetime(date) for div in dl.find_all("div"): div.extract() for span in dl.find_all("span"): span.extract() instance.content = dl.text.strip() if url.startswith("http://weibo.com/aj/comment"): dls = soup.find_all("dl", mid=True) for dl in dls: comment = Comment(uid=self.uid) set_instance(comment, dl) mblog.comments.append(comment) elif url.startswith("http://weibo.com/aj/mblog/info"): dls = soup.find_all("dl", mid=True) for dl in dls: forward = Forward(uid=self.uid, mid=dl["mid"]) set_instance(forward, dl) mblog.forwards.append(forward) elif url.startswith("http://weibo.com/aj/like"): lis = soup.find_all("li", uid=True) for li in lis: like = Like(uid=li["uid"]) like.avatar = li.find("img")["src"] mblog.likes.append(like) try: mblog.save() self.logger.debug("parse %s finish" % url) except ValidationError, e: return self._error(url, e)
                # info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap'}):
                    block_title = block_div.find('div', attrs={'class': 'WB_cardtitle_b'}).text.strip()
                    if block_title == u'基本信息':
                        profile_div = block_div
                    elif block_title == u'联系信息':
                        relation_div = block_div
                    elif block_title == u'工作信息':
                        career_div = block_div
                    elif block_title == u'教育信息':
                        edu_div = block_div
                    elif block_title == u'标签信息':
                        tags_div = block_div
            elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                header_soup = beautiful_soup(data['html'])
                links = header_soup.find_all('a')
                if len(links) == 3:
                    weibo_user.info.n_follows = int(links[0].find('strong').text)
                    weibo_user.info.n_fans = int(links[1].find('strong').text)
                    weibo_user.info.n_weibos = int(links[2].find('strong').text)
            elif domid.startswith('Pl_Official_RightGrowNew__'):
                right_soup = beautiful_soup(data['html'])
                level_div = right_soup.find('div', attrs={'class': 'level_box'})
                if level_div is not None:
                    for info_span in level_div.find_all(attrs={'class': 'info'}):
def parse(self, url=None):
    if self.bundle.exists == False:
        return [], []
    url = url or self.url
    params = urldecode(url)
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()

    params['_t'] = 0
    params['__rnd'] = str(int(time.time() * 1000))
    page = int(params.get('page', 1))
    pre_page = int(params.get('pre_page', 0))
    count = 15
    if 'pagebar' not in params:
        params['pagebar'] = '0'
        pre_page += 1
    elif params['pagebar'] == '0':
        params['pagebar'] = '1'
    elif params['pagebar'] == '1':
        del params['pagebar']
        pre_page = page
        page += 1
        count = 50
    params['count'] = count
    params['page'] = page
    params['pre_page'] = pre_page

    data = json.loads(br.response().read())['data']
    soup = beautiful_soup(data)

    divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True)
    max_id = None
    next_urls = []
    for div in divs:
        mid = div['mid']
        if len(mid) == 0:
            continue
        max_id = mid
        if 'end_id' not in params:
            params['end_id'] = mid

        weibo_user.likes.append(mid)

    weibo_user.save()

    if 'pagebar' in params:
        params['max_id'] = max_id
    else:
        del params['max_id']

    self.logger.debug('parse %s finish' % url)

    # if not has next page
    if len(divs) == 0:
        return [], []

    next_urls.append('%s?%s' % (url.split('?')[0], urllib.urlencode(params)))
    return next_urls, []
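# Note on the paging scheme used by the feed/like list parsers above, as read
# from the parameter handling: weibo serves each logical feed page in three
# AJAX requests. While 'pagebar' is absent it is set to '0', then flipped to
# '1'; once the pagebar='1' request has been handled, 'pagebar' is dropped,
# 'page' is advanced, 'pre_page' records the page just finished and 'count'
# grows to 50. 'end_id' stays pinned to the first mid seen, and 'max_id'
# tracks the last mid of the current batch while 'pagebar' is present.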
def parse(self, url=None): if self.bundle.exists == False: return [], [] url = url or self.url params = urldecode(url) br = self.opener.browse_open(url) self.logger.debug('load %s finish' % url) if not self.check(url, br): return [], [] weibo_user = self.get_weibo_user() params['_t'] = 0 params['__rnd'] = str(int(time.time() * 1000)) page = int(params.get('page', 1)) pre_page = int(params.get('pre_page', 0)) count = 15 if 'pagebar' not in params: params['pagebar'] = '0' pre_page += 1 elif params['pagebar'] == '0': params['pagebar'] = '1' elif params['pagebar'] == '1': del params['pagebar'] pre_page = page page += 1 count = 50 params['count'] = count params['page'] = page params['pre_page'] = pre_page data = json.loads(br.response().read())['data'] soup = beautiful_soup(data) finished = False divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True) max_id = None next_urls = [] for div in divs: mid = div['mid'] if len(mid) == 0: continue max_id = mid if 'end_id' not in params: params['end_id'] = mid if mid in weibo_user.newest_mids: finished = True break if len(self.bundle.newest_mids) < 3: self.bundle.newest_mids.append(mid) try: mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid)) except DoesNotExist: mblog = MicroBlog(mid=mid, uid=self.uid) content_div = div.find('div', attrs={ 'class': 'WB_text', 'node-type': 'feed_list_content' }) mblog.content = content_div.text # Links for content_a in content_div.find_all('a', attrs={'action-type': 'feed_list_url'}): href = content_a['href'] if href not in mblog.links: mblog.links.append(href) # tags tags_div = content_div.find('div', attrs={'class': 'wTablist2'}) if tags_div is not None: for tag_a in tags_div.find_all('a'): tag = tag_a.text.strip() if len(tag) > 0 and tag not in mblog.tags: mblog.tags.append(tag) is_forward = div.get('isforward') == '1' if is_forward: mblog.omid = div['omid'] mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title']) if self.bundle.last_update is None or mblog.created > self.bundle.last_update: self.bundle.last_update = mblog.created if weibo_user.last_update is not None and \ mblog.created <= weibo_user.last_update: finished = True break likes = div.find('a', attrs={'action-type': 'feed_list_like'}).text likes = likes.strip('(').strip(')') likes = 0 if len(likes) == 0 else int(likes) mblog.n_likes = likes forwards = div.find('a', attrs={'action-type': 'feed_list_forward'}).text if '(' not in forwards: mblog.n_forwards = 0 else: mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')')) comments = div.find('a', attrs={'action-type': 'feed_list_comment'}).text if '(' not in comments: mblog.n_comments = 0 else: mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')')) # fetch forwards and comments if fetch_forward or fetch_comment or fetch_like: query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)} query_str = urllib.urlencode(query) if fetch_forward and mblog.n_forwards > 0: forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str next_urls.append(forward_url) if fetch_comment and mblog.n_comments > 0: comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str next_urls.append(comment_url) if fetch_like and mblog.n_likes > 0: query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)} query_str = urllib.urlencode(query) like_url = 'http://weibo.com/aj/like/big?%s' % query_str next_urls.append(like_url) mblog.save() if 'pagebar' in params: params['max_id'] = max_id else: del params['max_id'] self.logger.debug('parse %s finish' % url) # 
if not has next page if len(divs) == 0 or finished: weibo_user = self.get_weibo_user() for mid in self.bundle.newest_mids: if mid not in weibo_user.newest_mids: weibo_user.newest_mids.append(mid) while len(weibo_user.newest_mids) > 3: weibo_user.newest_mids.pop() weibo_user.last_update = self.bundle.last_update weibo_user.save() return [], [] next_urls.append('%s?%s'%(url.split('?')[0], urllib.urlencode(params))) return next_urls, []
def parse(self, url=None): if self.bundle.exists == False: return [], [] url = url or self.url br = self.opener.browse_open(url) self.logger.debug('load %s finish' % url) soup = beautiful_soup(br.response().read()) if not self.check(url, br): return [], [] weibo_user = self.get_weibo_user() info = weibo_user.info if info is None: weibo_user.info = UserInfo() profile_div = None career_div = None edu_div = None tags_div = None weibo_ul = None rank_div = None credit_div = None head_pic_div = None user_atten_div = None for script in soup.find_all('script'): text = script.text if text.startswith('FM.view') and \ ("Pl_Official_LeftInfo__17" in text \ or "Pl_Official_Header__1" in text \ or "Pl_Official_RightGrow__17" in text \ or "Pl_Official_LeftInfo__36" in text \ or "Pl_Official_LeftInfo__41" in text \ or "Pl_Core_Header__1" in text \ ): text = text.replace('FM.view(', '')[:-1] if text.endswith(';'): text = text[:-1] data = json.loads(text) domid = data['domid'] if domid == 'Pl_Official_LeftInfo__17' or domid == 'Pl_Official_LeftInfo__36'\ or domid == 'Pl_Official_LeftInfo__41': info_soup = beautiful_soup(data['html']) info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'}) for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}): block_title = block_div.find('form').text.strip() if block_title == u'基本信息': profile_div = block_div elif block_title == u'工作信息': career_div = block_div elif block_title == u'教育信息': edu_div = block_div elif block_title == u'标签信息': tags_div = block_div elif domid == 'Pl_Official_RightGrow__17': right_soup = beautiful_soup(data['html']) right_div = right_soup.find('div', attrs={'class': 'prm_app_pinfo'}) for block_div in right_div.find_all('div', attrs={'class': 'info_block'}): block_title = block_div.find('form').text.strip() if block_title == u'等级信息': rank_div = block_div elif block_title == u'信用信息': credit_div = block_div elif domid == 'Pl_Official_Header__1': header_soup = beautiful_soup(data['html']) weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\ .find('img')['src'] weibo_ul = header_soup.find('ul', attrs={'class': 'user_atten clearfix user_atten_s'}) elif domid == 'Pl_Core_Header__1': core_header_soup = beautiful_soup(data['html']) head_div = core_header_soup.find('div', attrs={'class': 'pf_head S_bg5 S_line1'}) head_pic_div = head_div.find('div',attrs={'class': 'pf_head_pic'}) user_atten_div = head_div.find('div',attrs={'class': 'user_atten'}) elif 'STK' in text: text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1] data = json.loads(text) pid = data['pid'] if pid == 'pl_profile_infoBase': profile_div = beautiful_soup(data['html']) elif pid == 'pl_profile_infoCareer': career_div = beautiful_soup(data['html']) elif pid == 'pl_profile_infoEdu': edu_div = beautiful_soup(data['html']) elif pid == 'pl_profile_infoTag': tags_div = beautiful_soup(data['html']) elif pid == 'pl_profile_infoGrow': right_soup = beautiful_soup(data['html']) right_div = right_soup.find('div', attrs={'class': 'prm_app_pinfo'}) for block_div in right_div.find_all('div', attrs={'class': 'info_block'}): block_title = block_div.find('form').text.strip() if block_title == u'等级信息': rank_div = block_div elif block_title == u'信用信息': credit_div = block_div elif pid == 'pl_profile_photo': soup = beautiful_soup(data['html']) weibo_user.info.avatar = soup.find('img')['src'] weibo_ul = soup.find('ul', attrs={'class': 'user_atten clearfix user_atten_m'}) elif pid == 'pl_leftNav_profilePersonal': if weibo_user.info.avatar is 
None: soup = beautiful_soup(data['html']) weibo_user.info.avatar = soup.find('div',attrs={'class': 'face_infor'}).find('img')['src'] weibo_user.info.nickname = soup.find('div',attrs={'class': 'face_infor'}).find('a',attrs={'class': 'logo_img'})['title'] elif pid == 'pl_content_litePersonInfo': soup = beautiful_soup(data['html']) weibo_ul = soup.find('ul', attrs={'class': 'user_atten clearfix'}) profile_map = { u'昵称': {'field': 'nickname'}, u'真实姓名': {'field': 'realname'}, u'所在地': {'field': 'location'}, u'性别': {'field': 'sex'}, u'性取向': {'field': 'sex_dir'}, u'生日': {'field': 'birth'}, u'感情状况': {'field': 'love'}, u'血型': {'field': 'blood_type'}, u'博客': {'field': 'blog'}, u'个性域名': {'field': 'site'}, u'简介': {'field': 'intro'}, u'邮箱': {'field': 'email'}, u'QQ': {'field': 'qq'}, u'MSN': {'field': 'msn'} } if profile_div is not None: for div in profile_div.find_all(attrs={'class': 'pf_item'}): k = div.find(attrs={'class': 'label'}).text.strip() v = div.find(attrs={'class': 'con'}).text.strip() if k in profile_map: if k == u'个性域名' and '|' in v: v = v.split('|')[1].strip() func = (lambda s: s) \ if 'func' not in profile_map[k] \ else profile_map[k]['func'] v = func(v) setattr(weibo_user.info, profile_map[k]['field'], v) rank_map = { u'当前等级': {'field': 'rank'}, u'活跃天数': {'field': 'active_day'}, } if rank_div is not None: for div in rank_div.find_all(attrs={'class': 'info'}): k = div.text.strip()[:4] v = div.find(attrs={'class': 'S_txt1 point'}).text.strip('LV') if k in rank_map: func = (lambda s: s) \ if 'func' not in rank_map[k] \ else rank_map[k]['func'] v = func(v) setattr(weibo_user.info, rank_map[k]['field'], v) credit_map = { u'信用等级': {'field': 'credit_rank'}, u'当前信用积分': {'field': 'credit'}, } if credit_div is not None: for div in credit_div.find_all(attrs={'class': 'info'}): if u'信用等级' in div.text.strip(): k = div.text.strip()[:4] v = div.find(attrs={'class': 'S_txt1'}).text.strip() else: k = div.text.strip()[:6] v = div.find(attrs={'class': 'S_txt1 point'}).text.strip() if k in credit_map: func = (lambda s: s) \ if 'func' not in credit_map[k] \ else credit_map[k]['func'] v = func(v) setattr(weibo_user.info, credit_map[k]['field'], v) weibo_map = { u'关注': {'field': 'follow_num'}, u'粉丝': {'field': 'fans_num'}, u'微博': {'field': 'weibo_num'}, } if weibo_ul is not None: for li in weibo_ul.find_all('li'): k = li.find('span').text.strip() v = li.find('strong').text.strip() if k in weibo_map: func = (lambda s: s) \ if 'func' not in weibo_map[k] \ else weibo_map[k]['func'] v = func(v) setattr(weibo_user.info, weibo_map[k]['field'], v) weibo_user.info.work = [] if career_div is not None: for div in career_div.find_all(attrs={'class': 'con'}): work_info = WorkInfo() ps = div.find_all('p') for p in ps: a = p.find('a') if a is not None: work_info.name = a.text text = p.text if '(' in text: work_info.date = text.strip().split('(')[1].strip(')') else: text = p.text if text.startswith(u'地区:'): work_info.location = text.split(u':', 1)[1] elif text.startswith(u'职位:'): work_info.position = text.split(u':', 1)[1] else: work_info.detail = text weibo_user.info.work.append(work_info) weibo_user.info.edu = [] if edu_div is not None: for div in edu_div.find_all(attrs={'class': 'con'}): edu_info = EduInfo() ps = div.find_all('p') for p in ps: a = p.find('a') text = p.text if a is not None: edu_info.name = a.text if '(' in text: edu_info.date = text.strip().split('(')[1].strip(')') else: edu_info.detail = text weibo_user.info.edu.append(edu_info) weibo_user.info.tags = [] if tags_div is not None: for div in 
tags_div.find_all(attrs={'class': 'con'}): for a in div.find_all('a'): weibo_user.info.tags.append(a.text) if head_pic_div is not None and weibo_user.info.avatar is None: weibo_user.info.avatar = head_pic_div.find('img')['src'] weibo_user.info.nickname = head_pic_div.find('img')['title'] if weibo_ul is None and user_atten_div is not None: for td in user_atten_div.find_all('td'): k = td.find('span').text.strip() v = td.find('strong').text.strip() if k in weibo_map: func = (lambda s: s) \ if 'func' not in weibo_map[k] \ else weibo_map[k]['func'] v = func(v) setattr(weibo_user.info, weibo_map[k]['field'], v) weibo_user.save() self.logger.debug('parse %s finish' % url) return [], []
def parse(self, url=None): if self.bundle.exists == False: return [], [] url = url or self.url params = urldecode(url) br = self.opener.browse_open(url) self.logger.debug('load %s finish' % url) if not self.check(url, br): return [], [] weibo_user = self.get_weibo_user() params['_t'] = 0 params['__rnd'] = str(int(time.time() * 1000)) page = int(params.get('page', 1)) pre_page = int(params.get('pre_page', 0)) count = 15 if 'pagebar' not in params: params['pagebar'] = '0' pre_page += 1 elif params['pagebar'] == '0': params['pagebar'] = '1' elif params['pagebar'] == '1': del params['pagebar'] pre_page = page page += 1 count = 50 params['count'] = count params['page'] = page params['pre_page'] = pre_page data = json.loads(br.response().read())['data'] soup = beautiful_soup(data) finished = False divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True) max_id = None next_urls = [] for div in divs: mid = div['mid'] if len(mid) == 0: continue max_id = mid if 'end_id' not in params: params['end_id'] = mid if mid in weibo_user.newest_mids: finished = True break if len(self.bundle.newest_mids) < 3: self.bundle.newest_mids.append(mid) try: mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid)&Q(uid=self.uid)) except DoesNotExist: mblog = MicroBlog(mid=mid, uid=self.uid) content_div = div.find('div', attrs={ 'class': 'WB_text', 'node-type': 'feed_list_content' }) for img in content_div.find_all("img", attrs={'type': 'face'}): img.replace_with(img['title']); mblog.content = content_div.text is_forward = div.get('isforward') == '1' if is_forward: mblog.omid = div['omid'] name_a = div.find('a', attrs={ 'class': 'WB_name', 'node-type': 'feed_list_originNick' }) text_a = div.find('div', attrs={ 'class': 'WB_text', 'node-type': 'feed_list_reason' }) if name_a is not None and text_a is not None: mblog.forward = '%s: %s' % ( name_a.text, text_a.text ) #mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title']) #ci # temp = parse(div.select('a.S_link2.WB_time')[0]['title']) tempstring = temp.strftime("%Y-%m-%d-%H-%M-%S") list=tempstring.split('-') tempyear=list[0] tempmonth=list[1] tempday=list[2] temphour=list[3] tempmin=list[4] tempsec=list[5] temptime=time.mktime(datetime(int(tempyear),int(tempmonth),int(tempday),int(temphour),int(tempmin),int(tempsec)).timetuple()) print temptime timevalue=open("D:\\09Limited_buffer\\earlywarningbyci\\cola\\contrib\\weibo\\timevalue.txt","r") time_re=timevalue.readline() timevalue.close() list=time_re.split() starttime=list[0] endtime=list[1] print starttime temptime=round(float(temptime)) starttime=round(float(starttime)) endtime=round(float(endtime)) if temptime>=starttime and temptime<=endtime: mblog.created = temp #timeok = True print "------OKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOKOK-----" else: if temptime<starttime: print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
time.sleep(5) return [], [] #continue # # if self.bundle.last_update is None or mblog.created > self.bundle.last_update: self.bundle.last_update = mblog.created if weibo_user.last_update is not None and \ mblog.created <= weibo_user.last_update: finished = True break func_div = div.find_all('div', 'WB_func')[-1] action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t) likes = func_div.find('a', attrs={'action-type': action_type_re("like")}).text likes = likes.strip('(').strip(')') likes = 0 if len(likes) == 0 else int(likes) mblog.n_likes = likes forwards = func_div.find('a', attrs={'action-type': action_type_re("forward")}).text if '(' not in forwards: mblog.n_forwards = 0 else: mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')')) comments = func_div.find('a', attrs={'action-type': action_type_re('comment')}).text if '(' not in comments: mblog.n_comments = 0 else: mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')')) # fetch geo info map_info = div.find("div", attrs={'class': 'map_data'}) if map_info is not None: geo = Geo() geo.location = map_info.text.split('-')[0].strip() geo_info = urldecode("?"+map_info.find('a')['action-data'])['geo'] geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(',', 1)]) mblog.geo = geo # fetch forwards and comments if fetch_forward or fetch_comment or fetch_like: query = {'id': mid, '_t': 0, '__rnd': int(time.time()*1000)} query_str = urllib.urlencode(query) if fetch_forward and mblog.n_forwards > 0: forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str next_urls.append(forward_url) if fetch_comment and mblog.n_comments > 0: comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str next_urls.append(comment_url) if fetch_like and mblog.n_likes > 0: query = {'mid': mid, '_t': 0, '__rnd': int(time.time()*1000)} query_str = urllib.urlencode(query) like_url = 'http://weibo.com/aj/like/big?%s' % query_str next_urls.append(like_url) mblog.save() if 'pagebar' in params: params['max_id'] = max_id else: del params['max_id'] self.logger.debug('parse %s finish' % url) # if not has next page if len(divs) == 0 or finished: weibo_user = self.get_weibo_user() for mid in self.bundle.newest_mids: if mid not in weibo_user.newest_mids: weibo_user.newest_mids.append(mid) while len(weibo_user.newest_mids) > 3: weibo_user.newest_mids.pop() weibo_user.last_update = self.bundle.last_update weibo_user.save() return [], [] next_urls.append('%s?%s'%(url.split('?')[0], urllib.urlencode(params))) return next_urls, []
def parse(self, url=None): if self.bundle.exists is False: return url = url or self.url params = urldecode(url) br = self.opener.browse_open(url) # self.logger.debug('load %s finish' % url) if not self.check(url, br): return weibo_user = self.get_weibo_user() params['_t'] = 0 params['__rnd'] = str(int(time.time() * 1000)) page = int(params.get('page', 1)) pre_page = int(params.get('pre_page', 0)) count = 15 if 'pagebar' not in params: params['pagebar'] = '0' pre_page += 1 elif params['pagebar'] == '0': params['pagebar'] = '1' elif params['pagebar'] == '1': del params['pagebar'] pre_page = page page += 1 count = 50 params['count'] = count params['page'] = page params['pre_page'] = pre_page data = json.loads(br.response().read())['data'] soup = beautiful_soup(data) finished = False divs = soup.find_all('div', attrs={'class': 'WB_feed_type'}, mid=True) max_id = None for div in divs: mid = div['mid'] if len(mid) == 0: continue max_id = mid if 'end_id' not in params: params['end_id'] = mid if mid in weibo_user.newest_mids: finished = True break if len(self.bundle.newest_mids) < 3: self.bundle.newest_mids.append(mid) try: mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid)) except DoesNotExist: mblog = MicroBlog(mid=mid, uid=self.uid) content_div = div.find('div', attrs={ 'class': 'WB_text', 'node-type': 'feed_list_content' }) for img in content_div.find_all("img", attrs={'type': 'face'}): img.replace_with(img['title']) mblog.content = content_div.text is_forward = div.get('isforward') == '1' if is_forward: mblog.omid = div['omid'] name_a = div.find('a', attrs={ 'class': 'WB_name', 'node-type': 'feed_list_originNick' }) text_a = div.find('div', attrs={ 'class': 'WB_text', 'node-type': 'feed_list_reason' }) if name_a is not None and text_a is not None: mblog.forward = '%s: %s' % (name_a.text, text_a.text) mblog.created = parse(div.select('a.S_link2.WB_time')[0]['title']) if self.bundle.last_update is None or mblog.created > self.bundle.last_update: self.bundle.last_update = mblog.created if weibo_user.last_update is not None and \ mblog.created <= weibo_user.last_update: finished = True break func_div = div.find_all('div', 'WB_func')[-1] action_type_re = lambda t: re.compile("^(feed_list|fl)_%s$" % t) likes = func_div.find('a', attrs={ 'action-type': action_type_re("like") }).text likes = likes.strip('(').strip(')') likes = 0 if len(likes) == 0 else int(likes) mblog.n_likes = likes forwards = func_div.find('a', attrs={ 'action-type': action_type_re("forward") }).text if '(' not in forwards: mblog.n_forwards = 0 else: mblog.n_forwards = int(forwards.strip().split('(', 1)[1].strip(')')) comments = func_div.find('a', attrs={ 'action-type': action_type_re('comment') }).text if '(' not in comments: mblog.n_comments = 0 else: mblog.n_comments = int(comments.strip().split('(', 1)[1].strip(')')) # fetch geo info map_info = div.find("div", attrs={'class': 'map_data'}) if map_info is not None: geo = Geo() geo.location = map_info.text.split('-')[0].strip() geo_info = urldecode("?" 
+ map_info.find('a')['action-data'])['geo'] geo.longtitude, geo.latitude = tuple( [float(itm) for itm in geo_info.split(',', 1)]) mblog.geo = geo # fetch forwards and comments if fetch_forward or fetch_comment or fetch_like: query = {'id': mid, '_t': 0, '__rnd': int(time.time() * 1000)} query_str = urllib.urlencode(query) if fetch_forward and mblog.n_forwards > 0: forward_url = 'http://weibo.com/aj/mblog/info/big?%s' % query_str yield forward_url if fetch_comment and mblog.n_comments > 0: comment_url = 'http://weibo.com/aj/comment/big?%s' % query_str yield comment_url if fetch_like and mblog.n_likes > 0: query = { 'mid': mid, '_t': 0, '__rnd': int(time.time() * 1000) } query_str = urllib.urlencode(query) like_url = 'http://weibo.com/aj/like/big?%s' % query_str yield like_url mblog.save() if 'pagebar' in params: params['max_id'] = max_id else: del params['max_id'] # self.logger.debug('parse %s finish' % url) # counter add one for the processed weibo list url self.counter.inc('processed_weibo_list_page', 1) # if not has next page if len(divs) == 0 or finished: weibo_user = self.get_weibo_user() for mid in self.bundle.newest_mids: if mid not in weibo_user.newest_mids: weibo_user.newest_mids.append(mid) while len(weibo_user.newest_mids) > 3: weibo_user.newest_mids.pop() weibo_user.last_update = self.bundle.last_update weibo_user.save() return yield '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
def parse(self, url=None):
    if self.bundle.exists is False:
        return
    url = url or self.url
    br = self.opener.browse_open(url)
    # self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())
    if not self.check(url, br):
        return

    weibo_user = self.get_weibo_user()
    info = weibo_user.info
    if info is None:
        weibo_user.info = UserInfo()

    new_style = False

    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    # each pagelet is embedded as FM.view({...}) (or the older STK.pageletM.view)
    # inside a <script> tag; route the decoded payloads by domid/pid
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid.startswith('Pl_Official_LeftInfo__'):
                # old style profile blocks
                info_soup = beautiful_soup(data['html'])
                info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                    block_title = block_div.find('form').text.strip()
                    if block_title == u'基本信息':
                        profile_div = block_div
                    elif block_title == u'工作信息':
                        career_div = block_div
                    elif block_title == u'教育信息':
                        edu_div = block_div
                    elif block_title == u'标签信息':
                        tags_div = block_div
            elif domid.startswith('Pl_Official_PersonalInfo__'):
                # new style profile cards
                new_style = True
                info_soup = beautiful_soup(data['html'])
                for block_div in info_soup.find_all('div', attrs={'class': 'WB_cardwrap'}):
                    block_title_div = block_div.find('h4', attrs={'class': 'obj_name'})
                    if block_title_div is None:
                        block_title_div = block_div.find('div', attrs={'class': 'obj_name'})\
                                                   .find('h2')
                    if block_title_div is None:
                        continue
                    block_title = block_title_div.text.strip()
                    inner_div = block_div.find('div', attrs={'class': 'WB_innerwrap'})
                    if block_title == u'基本信息':
                        profile_div = inner_div
                    elif block_title == u'工作信息':
                        career_div = inner_div
                    elif block_title == u'教育信息':
                        edu_div = inner_div
                    elif block_title == u'标签信息':
                        tags_div = inner_div
            elif domid == 'Pl_Official_Header__1':
                # old style avatar and follow/fan counters
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                    .find('img')['src']
                weibo_user.info.n_follows = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                           .find('strong', attrs={'node-type': 'follow'}).text)
                weibo_user.info.n_fans = int(header_soup.find('ul', attrs={'class': 'user_atten'})\
                                                        .find('strong', attrs={'node-type': 'fans'}).text)
            elif domid.startswith('Pl_Core_T8CustomTriColumn__'):
                # new style friends info
                header_soup = beautiful_soup(data['html'])
                tds = header_soup.find('table', attrs={'class': 'tb_counter'})\
                                 .find_all('td')
                weibo_user.info.n_follows = int(tds[0].find('strong').text)
                weibo_user.info.n_fans = int(tds[1].find('strong').text)
            elif domid.startswith('Pl_Official_Headerv6__'):
                # new style avatar info
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find('p', attrs={'class': 'photo_wrap'})\
                                                    .find('img')['src']
        elif 'STK' in text:
            text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid == 'pl_profile_infoBase':
                profile_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoCareer':
                career_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoEdu':
                edu_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoTag':
                tags_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_photo':
                soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = soup.find('img')['src']

    # map the profile labels onto UserInfo attributes, with an optional converter
    profile_map = {
        u'昵称': {'field': 'nickname'},
        u'所在地': {'field': 'location'},
        u'性别': {'field': 'sex', 'func': lambda s: True if s == u'男' else False},
        u'生日': {'field': 'birth'},
        u'博客': {'field': 'blog'},
        u'个性域名': {'field': 'site'},
        u'简介': {'field': 'intro'},
        u'邮箱': {'field': 'email'},
        u'QQ': {'field': 'qq'},
        u'MSN': {'field': 'msn'}
    }
    if profile_div is not None:
        if not new_style:
            divs = profile_div.find_all(attrs={'class': 'pf_item'})
        else:
            divs = profile_div.find_all('li', attrs={'class': 'li_1'})
        for div in divs:
            if not new_style:
                k = div.find(attrs={'class': 'label'}).text.strip()
                v = div.find(attrs={'class': 'con'}).text.strip()
            else:
                k = div.find('span', attrs={'class': 'pt_title'}).text.strip().strip(u'：')
                d = div.find('span', attrs={'class': 'pt_detail'})
                if d:
                    v = d.text.strip()
                else:
                    v = div.find('a').text.strip()
            if k in profile_map:
                if k == u'个性域名' and '|' in v:
                    v = v.split('|')[1].strip()
                func = (lambda s: s) \
                    if 'func' not in profile_map[k] \
                    else profile_map[k]['func']
                v = func(v)
                setattr(weibo_user.info, profile_map[k]['field'], v)

    weibo_user.info.work = []
    if career_div is not None:
        if not new_style:
            for div in career_div.find_all(attrs={'class': 'con'}):
                work_info = WorkInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    if a is not None:
                        work_info.name = a.text
                        text = p.text
                        if '(' in text:
                            work_info.date = text.strip().split('(')[1].strip(')')
                    else:
                        text = p.text
                        if text.startswith(u'地区：'):
                            work_info.location = text.split(u'：', 1)[1]
                        elif text.startswith(u'职位：'):
                            work_info.position = text.split(u'：', 1)[1]
                        else:
                            work_info.detail = text
                weibo_user.info.work.append(work_info)
        else:
            li = career_div.find('li', attrs={'class': 'li_1'})
            for span in li.find_all('span', attrs={'class': 'pt_detail'}):
                work_info = WorkInfo()
                text = span.text
                a = span.find('a')
                if a is not None:
                    work_info.name = a.text
                if '(' in text:
                    work_info.date = text.strip().split('(')[1]\
                                         .replace('\r', '')\
                                         .replace('\n', '')\
                                         .replace('\t', '')\
                                         .split(')', 1)[0]
                for l in text.split('\r\n'):
                    l = l.strip()
                    if len(l) == 0:
                        continue
                    if l.startswith(u'地区：'):
                        work_info.location = l.split(u'：', 1)[1]
                    elif l.startswith(u'职位：'):
                        work_info.position = l.split(u'：', 1)[1]
                    else:
                        work_info.detail = text.replace('\r', '')\
                                               .replace('\n', '')\
                                               .replace('\t', '')\
                                               .strip()
                weibo_user.info.work.append(work_info)

    weibo_user.info.edu = []
    if edu_div is not None:
        if not new_style:
            for div in edu_div.find_all(attrs={'class': 'con'}):
                edu_info = EduInfo()
                ps = div.find_all('p')
                for p in ps:
                    a = p.find('a')
                    text = p.text
                    if a is not None:
                        edu_info.name = a.text
                        if '(' in text:
                            edu_info.date = text.strip().split('(')[1].strip().strip(')')
                    else:
                        edu_info.detail = text
                weibo_user.info.edu.append(edu_info)
        else:
            span = edu_div.find('li', attrs={'class': 'li_1'})\
                          .find('span', attrs={'class': 'pt_detail'})
            text = span.text
            names = []
            for a in span.find_all('a'):
                names.append(a.text)
            for idx, name in enumerate(names):
                start_pos = text.find(name) + len(name)
                if idx < len(names) - 1:
                    end_pos = text.find(names[idx + 1], start_pos)
                else:
                    end_pos = len(text)
                t = text[start_pos:end_pos]
                edu_info = EduInfo()
                edu_info.name = name
                if '(' in text:
                    edu_info.date = t.strip().split('(')[1]\
                                     .replace('\r', '')\
                                     .replace('\n', '')\
                                     .replace('\t', '')\
                                     .split(')', 1)[0]
                    t = t[t.find(')') + 1:]
                text = text[end_pos:]
                edu_info.detail = t.replace('\r', '').replace('\n', '')\
                                   .replace('\t', '').strip()
                weibo_user.info.edu.append(edu_info)

    weibo_user.info.tags = []
    if tags_div is not None:
        if not new_style:
            for div in tags_div.find_all(attrs={'class': 'con'}):
                for a in div.find_all('a'):
                    weibo_user.info.tags.append(a.text)
        else:
            for a in tags_div.find('span', attrs={'class': 'pt_detail'}).find_all('a'):
                weibo_user.info.tags.append(a.text.strip())

    weibo_user.save()
    # self.logger.debug('parse %s finish' % url)

    # counter add one for the profile url
    self.counter.inc('processed_profile_page', 1)
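# The profile parser above hinges on one extraction step: weibo.com embeds each
# pagelet as an FM.view({...}) call inside a <script> tag, and the useful markup
# sits in the 'html' field of that JSON payload, keyed by 'domid'.  A minimal
# standalone sketch of that step, assuming the same beautiful_soup/json helpers
# as the code above (the name `iter_fm_view_payloads` is illustrative only, not
# part of the project):
def iter_fm_view_payloads(page_html):
    soup = beautiful_soup(page_html)
    for script in soup.find_all('script'):
        text = script.text
        if not text.startswith('FM.view'):
            continue
        # drop the "FM.view(" prefix, any trailing ";", and the closing ")"
        text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
        try:
            data = json.loads(text)
        except ValueError:
            continue
        if 'domid' in data and 'html' in data:
            yield data['domid'], data['html']

# usage, routing pagelets by their domid the way parse() does:
# for domid, html in iter_fm_view_payloads(br.response().read()):
#     if domid.startswith('Pl_Official_PersonalInfo__'):
#         info_soup = beautiful_soup(html)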
def parse(self, url=None):
    if self.bundle.exists is False:
        return [], []
    url = url or self.url
    br = self.opener.browse_open(url)
    self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(br.response().read())
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()
    info = weibo_user.info
    if info is None:
        weibo_user.info = UserInfo()

    profile_div = None
    career_div = None
    edu_div = None
    tags_div = None
    # legacy layout: pagelets keyed by a fixed domid (FM.view) or pid (STK)
    for script in soup.find_all('script'):
        text = script.text
        if text.startswith('FM.view'):
            text = text.strip().replace(';', '').replace('FM.view(', '')[:-1]
            data = json.loads(text)
            domid = data['domid']
            if domid == 'Pl_Official_LeftInfo__13':
                info_soup = beautiful_soup(data['html'])
                info_div = info_soup.find('div', attrs={'class': 'profile_pinfo'})
                for block_div in info_div.find_all('div', attrs={'class': 'infoblock'}):
                    block_title = block_div.find('form').text.strip()
                    if block_title == u'基本信息':
                        profile_div = block_div
                    elif block_title == u'工作信息':
                        career_div = block_div
                    elif block_title == u'教育信息':
                        edu_div = block_div
                    elif block_title == u'标签信息':
                        tags_div = block_div
            elif domid == 'Pl_Official_Header__1':
                header_soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = header_soup.find('div', attrs={'class': 'pf_head_pic'})\
                                                    .find('img')['src']
        elif 'STK' in text:
            text = text.replace('STK && STK.pageletM && STK.pageletM.view(', '')[:-1]
            data = json.loads(text)
            pid = data['pid']
            if pid == 'pl_profile_infoBase':
                profile_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoCareer':
                career_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoEdu':
                edu_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_infoTag':
                tags_div = beautiful_soup(data['html'])
            elif pid == 'pl_profile_photo':
                soup = beautiful_soup(data['html'])
                weibo_user.info.avatar = soup.find('img')['src']

    profile_map = {
        u'昵称': {'field': 'nickname'},
        u'所在地': {'field': 'location'},
        u'性别': {'field': 'sex', 'func': lambda s: True if s == u'男' else False},
        u'生日': {'field': 'birth'},
        u'博客': {'field': 'blog'},
        u'个性域名': {'field': 'site'},
        u'简介': {'field': 'intro'},
        u'邮箱': {'field': 'email'},
        u'QQ': {'field': 'qq'},
        u'MSN': {'field': 'msn'}
    }
    if profile_div is not None:
        for div in profile_div.find_all(attrs={'class': 'pf_item'}):
            k = div.find(attrs={'class': 'label'}).text.strip()
            v = div.find(attrs={'class': 'con'}).text.strip()
            if k in profile_map:
                if k == u'个性域名' and '|' in v:
                    v = v.split('|')[1].strip()
                func = (lambda s: s) \
                    if 'func' not in profile_map[k] \
                    else profile_map[k]['func']
                v = func(v)
                setattr(weibo_user.info, profile_map[k]['field'], v)

    weibo_user.info.work = []
    if career_div is not None:
        for div in career_div.find_all(attrs={'class': 'con'}):
            work_info = WorkInfo()
            ps = div.find_all('p')
            for p in ps:
                a = p.find('a')
                if a is not None:
                    work_info.name = a.text
                    text = p.text
                    if '(' in text:
                        work_info.date = text.strip().split('(')[1].strip(')')
                else:
                    text = p.text
                    if text.startswith(u'地区：'):
                        work_info.location = text.split(u'：', 1)[1]
                    elif text.startswith(u'职位：'):
                        work_info.position = text.split(u'：', 1)[1]
                    else:
                        work_info.detail = text
            weibo_user.info.work.append(work_info)

    weibo_user.info.edu = []
    if edu_div is not None:
        for div in edu_div.find_all(attrs={'class': 'con'}):
            edu_info = EduInfo()
            ps = div.find_all('p')
            for p in ps:
                a = p.find('a')
                text = p.text
                if a is not None:
                    edu_info.name = a.text
                    if '(' in text:
                        edu_info.date = text.strip().split('(')[1].strip(')')
                else:
                    edu_info.detail = text
            weibo_user.info.edu.append(edu_info)

    weibo_user.info.tags = []
    if tags_div is not None:
        for div in tags_div.find_all(attrs={'class': 'con'}):
            for a in div.find_all('a'):
                weibo_user.info.tags.append(a.text)

    weibo_user.save()
    self.logger.debug('parse %s finish' % url)
    return [], []
def get_body(self, soup):
    for elem in soup.find_all(['script', 'link', 'style']):
        elem.extract()
    raw_html = unicode(soup.body or soup)
    cleaned = self._clean_attributes(raw_html)
    return beautiful_soup(cleaned)
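# get_body delegates attribute cleanup to a _clean_attributes helper that is not
# shown in this file.  A self-contained sketch of the same idea using only
# BeautifulSoup -- drop script/link/style elements and strip tag attributes --
# purely illustrative, not the project's implementation:
def strip_page_chrome(html):
    soup = beautiful_soup(html)
    for elem in soup.find_all(['script', 'link', 'style']):
        elem.extract()              # remove the element from the tree
    body = soup.body or soup
    for tag in body.find_all(True):
        tag.attrs = {}              # discard attributes, keep text and structure
    return body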
def parse(self, url=None):
    if self.bundle.exists is False:
        return [], []
    url = url or self.url
    params = urldecode(url)
    br = self.opener.browse_open(url)
    self.logger.debug("load %s finish" % url)
    if not self.check(url, br):
        return [], []

    weibo_user = self.get_weibo_user()

    params["_t"] = 0
    params["__rnd"] = str(int(time.time() * 1000))
    page = int(params.get("page", 1))
    pre_page = int(params.get("pre_page", 0))
    count = 15
    # every feed page is fetched in three requests: pagebar=0, pagebar=1,
    # then a plain request that moves on to the next page
    if "pagebar" not in params:
        params["pagebar"] = "0"
        pre_page += 1
    elif params["pagebar"] == "0":
        params["pagebar"] = "1"
    elif params["pagebar"] == "1":
        del params["pagebar"]
        pre_page = page
        page += 1
        count = 50
    params["count"] = count
    params["page"] = page
    params["pre_page"] = pre_page

    data = json.loads(br.response().read())["data"]
    soup = beautiful_soup(data)

    finished = False

    divs = soup.find_all("div", attrs={"class": "WB_feed_type"}, mid=True)
    max_id = None
    next_urls = []
    for div in divs:
        mid = div["mid"]
        if len(mid) == 0:
            continue
        max_id = mid
        if "end_id" not in params:
            params["end_id"] = mid
        if mid in weibo_user.newest_mids:
            finished = True
            break
        if len(self.bundle.newest_mids) < 3:
            self.bundle.newest_mids.append(mid)

        try:
            mblog = getattr(MicroBlog, "objects").get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
        content_div = div.find("div", attrs={"class": "WB_text", "node-type": "feed_list_content"})
        for img in content_div.find_all("img", attrs={"type": "face"}):
            img.replace_with(img["title"])
        mblog.content = content_div.text
        is_forward = div.get("isforward") == "1"
        if is_forward:
            name_a = div.find("a", attrs={"class": "WB_name", "node-type": "feed_list_originNick"})
            text_a = div.find("div", attrs={"class": "WB_text", "node-type": "feed_list_reason"})
            if name_a is not None and text_a is not None:
                mblog.forward = "%s: %s" % (name_a.text, text_a.text)
        # `parse` here is the module-level date parser, not this method
        mblog.created = parse(div.select("a.S_link2.WB_time")[0]["title"])

        if self.bundle.last_update is None or mblog.created > self.bundle.last_update:
            self.bundle.last_update = mblog.created
        if weibo_user.last_update is not None and mblog.created <= weibo_user.last_update:
            finished = True
            break

        likes = div.find("a", attrs={"action-type": "feed_list_like"}).text
        likes = likes.strip("(").strip(")")
        likes = 0 if len(likes) == 0 else int(likes)
        mblog.n_likes = likes
        forwards = div.find("a", attrs={"action-type": "feed_list_forward"}).text
        if "(" not in forwards:
            mblog.n_forwards = 0
        else:
            mblog.n_forwards = int(forwards.strip().split("(", 1)[1].strip(")"))
        comments = div.find("a", attrs={"action-type": "feed_list_comment"}).text
        if "(" not in comments:
            mblog.n_comments = 0
        else:
            mblog.n_comments = int(comments.strip().split("(", 1)[1].strip(")"))

        # fetch geo info
        map_info = div.find("div", attrs={"class": "map_data"})
        if map_info is not None:
            geo = Geo()
            geo.location = map_info.text.split("-")[0].strip()
            geo_info = urldecode("?" + map_info.find("a")["action-data"])["geo"]
            # 'longtitude' (sic) follows the field name on the Geo model
            geo.longtitude, geo.latitude = tuple([float(itm) for itm in geo_info.split(",", 1)])
            mblog.geo = geo

        # fetch forwards, comments and likes;
        # fetch_forward/fetch_comment/fetch_like are module-level switches
        # taken from the crawler configuration
        if fetch_forward or fetch_comment or fetch_like:
            query = {"id": mid, "_t": 0, "__rnd": int(time.time() * 1000)}
            query_str = urllib.urlencode(query)
            if fetch_forward and mblog.n_forwards > 0:
                # /aj/mblog/info/big returns the forward list and /aj/comment/big
                # the comment list, matching the handler for these urls below
                forward_url = "http://weibo.com/aj/mblog/info/big?%s" % query_str
                next_urls.append(forward_url)
            if fetch_comment and mblog.n_comments > 0:
                comment_url = "http://weibo.com/aj/comment/big?%s" % query_str
                next_urls.append(comment_url)
            if fetch_like and mblog.n_likes > 0:
                query = {"mid": mid, "_t": 0, "__rnd": int(time.time() * 1000)}
                query_str = urllib.urlencode(query)
                like_url = "http://weibo.com/aj/like/big?%s" % query_str
                next_urls.append(like_url)

        mblog.save()

    if "pagebar" in params:
        params["max_id"] = max_id
    else:
        del params["max_id"]

    self.logger.debug("parse %s finish" % url)

    # if there is no next page, flush the bookkeeping onto the stored user
    if len(divs) == 0 or finished:
        weibo_user = self.get_weibo_user()
        for mid in self.bundle.newest_mids:
            # record mids from this run that the stored user has not seen yet
            if mid not in weibo_user.newest_mids:
                weibo_user.newest_mids.append(mid)
        while len(weibo_user.newest_mids) > 3:
            weibo_user.newest_mids.pop()
        weibo_user.last_update = self.bundle.last_update
        weibo_user.save()
        return [], []

    next_urls.append("%s?%s" % (url.split("?")[0], urllib.urlencode(params)))
    return next_urls, []
def parse(self, url=None): if self.bundle.exists == False: return [], [] url = url or self.url br = self.opener.browse_open(url) self.logger.debug("load %s finish" % url) soup = beautiful_soup(br.response().read()) if not self.check(url, br): return [], [] weibo_user = self.get_weibo_user() info = weibo_user.info if info is None: weibo_user.info = UserInfo() profile_div = None career_div = None edu_div = None tags_div = None for script in soup.find_all("script"): text = script.text if "FM.view" in text: text = text.replace("FM.view(", "")[:-1] data = json.loads(text) domid = data["domid"] if domid == "Pl_Official_LeftInfo__13": info_soup = beautiful_soup(data["html"]) info_div = info_soup.find("div", attrs={"class": "profile_pinfo"}) for block_div in info_div.find_all("div", attrs={"class": "infoblock"}): block_title = block_div.find("form").text.strip() if block_title == u"基本信息": profile_div = block_div elif block_title == u"工作信息": career_div = block_div elif block_title == u"教育信息": edu_div = block_div elif block_title == u"标签信息": tags_div = block_div elif domid == "Pl_Official_Header__1": header_soup = beautiful_soup(data["html"]) weibo_user.info.avatar = header_soup.find("div", attrs={"class": "pf_head_pic"}).find("img")["src"] elif "STK" in text: text = text.replace("STK && STK.pageletM && STK.pageletM.view(", "")[:-1] data = json.loads(text) pid = data["pid"] if pid == "pl_profile_infoBase": profile_div = beautiful_soup(data["html"]) elif pid == "pl_profile_infoCareer": career_div = beautiful_soup(data["html"]) elif pid == "pl_profile_infoEdu": edu_div = beautiful_soup(data["html"]) elif pid == "pl_profile_infoTag": tags_div = beautiful_soup(data["html"]) elif pid == "pl_profile_photo": soup = beautiful_soup(data["html"]) weibo_user.info.avatar = soup.find("img")["src"] profile_map = { u"昵称": {"field": "nickname"}, u"所在地": {"field": "location"}, u"性别": {"field": "sex", "func": lambda s: True if s == u"男" else False}, u"生日": {"field": "birth"}, u"博客": {"field": "blog"}, u"个性域名": {"field": "site"}, u"简介": {"field": "intro"}, u"邮箱": {"field": "email"}, u"QQ": {"field": "qq"}, u"MSN": {"field": "msn"}, } if profile_div is not None: for div in profile_div.find_all(attrs={"class": "pf_item"}): k = div.find(attrs={"class": "label"}).text.strip() v = div.find(attrs={"class": "con"}).text.strip() if k in profile_map: if k == u"个性域名" and "|" in v: v = v.split("|")[1].strip() func = (lambda s: s) if "func" not in profile_map[k] else profile_map[k]["func"] v = func(v) setattr(weibo_user.info, profile_map[k]["field"], v) weibo_user.info.work = [] if career_div is not None: for div in career_div.find_all(attrs={"class": "con"}): work_info = WorkInfo() ps = div.find_all("p") for p in ps: a = p.find("a") if a is not None: work_info.name = a.text text = p.text if "(" in text: work_info.date = text.strip().split("(")[1].strip(")") else: text = p.text if text.startswith(u"地区:"): work_info.location = text.split(u":", 1)[1] elif text.startswith(u"职位:"): work_info.position = text.split(u":", 1)[1] else: work_info.detail = text weibo_user.info.work.append(work_info) weibo_user.info.edu = [] if edu_div is not None: for div in edu_div.find_all(attrs={"class": "con"}): edu_info = EduInfo() ps = div.find_all("p") for p in ps: a = p.find("a") text = p.text if a is not None: edu_info.name = a.text if "(" in text: edu_info.date = text.strip().split("(")[1].strip(")") else: edu_info.detail = text weibo_user.info.edu.append(edu_info) weibo_user.info.tags = [] if tags_div is not None: for div in 
tags_div.find_all(attrs={"class": "con"}): for a in div.find_all("a"): weibo_user.info.tags.append(a.text) weibo_user.save() self.logger.debug("parse %s finish" % url) return [], []
def parse(self, url=None):
    if self.bundle.exists is False:
        return
    url = url or self.url
    br = self.opener.browse_open(url)
    jsn = json.loads(br.response().read())
    # self.logger.debug('load %s finish' % url)
    soup = beautiful_soup(jsn['data']['html'])
    current_page = jsn['data']['page']['pagenum']
    n_pages = jsn['data']['page']['totalpage']
    if not self.check(url, br):
        return

    decodes = urldecode(url)
    mid = decodes.get('id', decodes.get('mid'))

    mblog = self.bundle.current_mblog
    if mblog is None or mblog.mid != mid:
        try:
            mblog = getattr(MicroBlog, 'objects').get(Q(mid=mid) & Q(uid=self.uid))
        except DoesNotExist:
            mblog = MicroBlog(mid=mid, uid=self.uid)
            mblog.save()

    def set_instance(instance, dl):
        instance.avatar = dl.find('dt').find('img')['src']
        date = dl.find('dd').find(attrs={'class': 'S_txt2'}).text
        date = date.strip().strip('(').strip(')')
        instance.created = self.parse_datetime(date)
        for div in dl.find_all('div'):
            div.extract()
        for span in dl.find_all('span'):
            span.extract()
        instance.content = dl.text.strip()

    # the comment, forward and like lists share this handler; dispatch on the url
    counter_type = None
    if url.startswith('http://weibo.com/aj/comment'):
        counter_type = 'comment'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            uid = dl.find('a', usercard=True)['usercard'].split("id=", 1)[1]
            comment = Comment(uid=uid)
            set_instance(comment, dl)
            mblog.comments.append(comment)
    elif url.startswith('http://weibo.com/aj/mblog/info'):
        counter_type = 'forward'
        dls = soup.find_all('dl', mid=True)
        for dl in dls:
            forward_again_a = dl.find('a', attrs={'action-type': re.compile("^(feed_list|fl)_forward$")})
            uid = urldecode('?%s' % forward_again_a['action-data'])['uid']
            forward = Forward(uid=uid, mid=dl['mid'])
            set_instance(forward, dl)
            mblog.forwards.append(forward)
    elif url.startswith('http://weibo.com/aj/like'):
        counter_type = 'like'
        lis = soup.find_all('li', uid=True)
        for li in lis:
            like = Like(uid=li['uid'])
            like.avatar = li.find('img')['src']
            mblog.likes.append(like)

    mblog.save()
    # self.logger.debug('parse %s finish' % url)

    # counter add one for the processed forward or comment or like list url
    if counter_type is not None:
        self.counter.inc('processed_%s_list_page' % counter_type, 1)

    # this handler is a generator: when more pages remain, yield the next page url
    if current_page >= n_pages:
        return

    params = urldecode(url)
    new_params = urldecode('?page=%s' % (current_page + 1))
    params.update(new_params)
    params['__rnd'] = int(time.time() * 1000)
    next_page = '%s?%s' % (url.split('?')[0], urllib.urlencode(params))
    yield next_page
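# The tail of the handler above turns the current AJAX url into the next page's
# url by decoding its query string, bumping `page`, refreshing `__rnd`, and
# re-encoding.  A standalone sketch with the standard library only; the name
# `build_next_page_url` is illustrative, and the real code reuses the project's
# urldecode helper instead of urlparse.parse_qsl:
import time
import urllib
import urlparse

def build_next_page_url(url, next_page):
    base, _, query = url.partition('?')
    params = dict(urlparse.parse_qsl(query))
    params['page'] = next_page
    params['__rnd'] = int(time.time() * 1000)
    return '%s?%s' % (base, urllib.urlencode(params))

# e.g. build_next_page_url('http://weibo.com/aj/comment/big?id=123&page=1', 2)
# rebuilds the url with page=2 and a fresh __rnd (parameter order may vary)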