def _parse_user(self, response):
    """Parse a weibo profile page into a ``UserItem``.

    The page embeds each widget as an ``FM.view({...})`` JSONP ``<script>``
    block; the relevant blocks are located by their ``domid``, unwrapped
    back to HTML via ``JsonUtil.jsonp_to_html`` and re-queried with XPath.

    :param response: scrapy Response for the user's profile page.
    :return: a populated ``UserItem``, or ``None`` when parsing fails
        (the error and the offending HTML fragment are logged).
    """
    try:
        user_item = UserItem()
        _html = response.text
        # --- header widget: nickname / gender / vip / verified / intro ---
        _json = response.xpath(
            '''/html/script[starts-with(text(),'FM.view({"ns":"pl.header.preloginHead.index",'''
            '''"domid":"Pl_Official_Headerv6') or starts-with(text(),'FM.view({"ns":"pl.header.head.index",'''
            '''"domid":"Pl_Official_Headerv6')]/text()''')[0].extract()
        _html = JsonUtil.jsonp_to_html(_json)
        _html_ele = HtmlResponse(url=response.url, encoding='utf-8', body=_html)
        user_item['nickname'] = _html_ele.xpath(
            './descendant::h1[@class="username"]/text()')[0].extract()
        # The icon class ends in "_female"/"_male"; keep the token after
        # the last underscore as the gender value.
        gender_class = \
            _html_ele.xpath('./descendant::i[@class="W_icon icon_pf_female" or @class="W_icon icon_pf_male"]/'
                            '@class')[0].extract()
        user_item['gender'] = gender_class[gender_class.rindex('_') + 1:]
        # vip6 uses the style "W_icon icon_member6"; a lapsed membership is
        # marked with "icon_member_dis" and is excluded here.
        user_item['is_vip'] = len(
            _html_ele.xpath(
                './descendant::a[@href="http://vip.weibo.com/personal?'
                'from=main"]/em[not(contains(@class,"icon_member_dis"))]').
            extract()) > 0
        user_item['verified'] = len(
            _html_ele.xpath(
                './descendant::div[@class="pf_photo"]/a').extract()) > 0
        user_item['introduction'] = _html_ele.xpath(
            './descendant::div[@class="pf_intro" and 2]/text()'
        )[0].extract().strip()
        # --- user-info widget: level text such as "Lv.12" ---
        _json = response.xpath(
            '''/html/script[starts-with(text(),'FM.view({"ns":"pl.content.homeFeed.index",'''
            '''"domid":"Pl_Core_UserInfo')]/text()''').extract()[0]
        _html = JsonUtil.jsonp_to_html(_json)
        _html_ele = HtmlResponse(url=response.url, encoding='utf-8', body=_html)
        level_text = _html_ele.xpath(
            './descendant::a/span/text()')[0].extract()
        user_item['level'] = int(level_text[level_text.index('.') + 1:])
        # --- tri-column widget: concern / fans / weibo counters ---
        # NOTE(review): this xpath extracts the whole <script> element (no
        # /text() step) — presumably jsonp_to_html tolerates that; confirm.
        _json = response.xpath(
            '''/html/script[starts-with(text(),'FM.view({"ns":"","domid":'''
            '''"Pl_Core_T8CustomTriColumn')]''').extract()[0]
        _html = JsonUtil.jsonp_to_html(_json)
        _html_ele = HtmlResponse(url=response.url, encoding='utf-8', body=_html)
        nums = _html_ele.xpath(
            './descendant::td/descendant::strong/text()').extract()
        user_item['concern_num'] = int(nums[0])
        user_item['fans_num'] = int(nums[1])
        user_item['weibo_num'] = int(nums[2])
        # Strip any query string to obtain the canonical home URL.
        user_item['home_url'] = response.url.split('?', 1)[0]
        Spider.log(self, 'user_item: %s' % user_item, level=logging.INFO)
        return user_item
    except Exception:
        # Was a bare "except:"; narrowed so SystemExit/KeyboardInterrupt
        # still propagate. Log the URL plus the last HTML fragment parsed.
        Spider.log(self, "%s\n%s" % (response.url, _html), logging.ERROR)
        traceback.print_exc()
def _parse_div_list_v2(self, div_list_v2):
    """Yield one ``WeiboItem`` per video-style ("v2" layout) card.

    Card metadata lives under ``div.list_des``; the video link, cover
    image and encoded source URL live under ``div.vid``.

    :param div_list_v2: iterable of scrapy selectors, one per weibo card.
    """
    for i in div_list_v2:
        weibo_item = WeiboItem()
        weibo_item['mid'] = i.xpath('./@mid')[0].extract()
        weibo_item['nickname'] = \
            i.xpath('./div[@class="list_des"]/div[@class="subinfo_box clearfix"]/a[2]/span/text()')[0].extract()
        date_str = \
            i.xpath(
                './div[@class="list_des"]/div[@class="subinfo_box clearfix"]/span[@class="subinfo S_txt2"]/'
                'text()')[0].extract()
        weibo_item['date'] = self.__process_datestr(date_str)
        content_div = i.xpath(
            './div[@class="list_des"]/*[1]/*/descendant-or-self::text()'
        ).extract()
        weibo_item['content'] = ''.join(content_div)
        weibo_item['source_url'] = 'http:' + i.xpath(
            './div[@class="vid"]/@href')[0].extract()
        weibo_item['image_urls'] = None
        # The video source is URL-encoded inside the action-data attribute,
        # between "video_src=" and "&cover_img=".
        action_data = i.xpath(
            './div[@class="vid"]/@action-data')[0].extract()
        video_src = action_data[action_data.index('video_src=') +
                                10:action_data.index('&cover_img=')]
        weibo_item['video_url'] = parse.unquote(video_src)
        # FIX: extract the full list of counter texts (as the sibling
        # _parse_div_list_b does). The previous "[0].extract()" returned a
        # single string, so nums[-1]/[-2]/[-3] indexed characters, not the
        # forward/comment/praise counters.
        nums = i.xpath(
            './div[@class="list_des"]/div[@class="subinfo_box clearfix subinfo_box_btm"]/span[@class="subinfo_rgt '
            'S_txt2"]/em[2]/text()').extract()
        weibo_item['forwarding_num'] = int(nums[-1])
        weibo_item['comment_num'] = int(nums[-2])
        weibo_item['praise_num'] = int(nums[-3])
        Spider.log(self, weibo_item)
        yield weibo_item
def _parse_div_list_b(self, div_list_b):
    """Yield one ``WeiboItem`` per picture-style ("b" layout) card.

    Mirrors ``_parse_div_list_v2`` but reads image URLs instead of a
    video source; counters arrive as the last three <em> texts in order
    praise, comment, forward.

    :param div_list_b: iterable of scrapy selectors, one per weibo card.
    """
    # Shared prefix for the metadata sub-tree of every card.
    des_box = './div[@class="list_des"]/div[@class="subinfo_box clearfix"]'
    for card in div_list_b:
        item = WeiboItem()
        item['mid'] = card.xpath('./@mid')[0].extract()
        item['nickname'] = card.xpath(
            des_box + '/a[2]/span/text()')[0].extract()
        raw_date = card.xpath(
            des_box + '/span[@class="subinfo S_txt2"]/text()')[0].extract()
        item['date'] = self.__process_datestr(raw_date)
        # Concatenate every text node of the first child of list_des.
        text_parts = card.xpath(
            './div[@class="list_des"]/*[1]/*/descendant-or-self::text()'
        ).extract()
        item['content'] = ''.join(text_parts)
        item['source_url'] = 'http:' + card.xpath('./@href')[0].extract()
        item['image_urls'] = card.xpath('./div[1]/img/@src').extract()
        item['video_url'] = None
        counters = card.xpath(
            des_box + '/span[@class="subinfo_rgt S_txt2"]/em[2]/text()'
        ).extract()
        # Last three counter texts, rightmost first: forward, comment, praise.
        item['forwarding_num'] = int(counters[-1])
        item['comment_num'] = int(counters[-2])
        item['praise_num'] = int(counters[-3])
        Spider.log(self, item)
        yield item
def process_request(self, request: Request, spider: Spider) -> None:
    """Route the request through the local Privoxy→Tor proxy.

    After ``max_count`` proxied requests, ask the Tor controller for a
    fresh circuit before continuing; a failed renewal is fatal.

    :param request: outgoing scrapy Request; its proxy meta is set here.
    :param spider: the running spider, used only for logging.
    """
    rotation_due = self.items_scraped >= self.max_count
    if rotation_due:
        spider.log('Changing Tor IP...')
        self.items_scraped = 0
        renewed_ip = self.tc.renew_ip()
        if not renewed_ip:
            raise Exception('FatalError: Failed to find a new IP')
        spider.log(f'New Tor IP: {renewed_ip}')
    # http://127.0.0.1:8118 is the default address for Privoxy
    request.meta['proxy'] = 'http://127.0.0.1:8118'
    self.items_scraped += 1
def process_request(self, request: Request, spider: Spider):
    """Attach the current user-agent header to the outgoing request.

    Once the current agent has served ``limit_usage`` requests, advance
    to the next agent in the cycle and draw a new random usage limit
    from ``[min_usage, max_usage]``.

    :param request: outgoing scrapy Request; its user-agent header is set.
    :param spider: the running spider, used only for logging.
    """
    rotate = self.items_scraped >= self.limit_usage
    if rotate:
        # Announce before mutating so the message names the retiring agent.
        spider.log(
            f'Changing user-agent "{self.user_agent}" after {self.limit_usage} requests'
        )
        self.items_scraped = 0
        self.limit_usage = random.randint(self.min_usage, self.max_usage)
        self.user_agent = next(self.user_agents)
        spider.log(
            f'User-agent changed to "{self.user_agent}". A new user-agent will be chosen after {self.limit_usage} requests'
        )
    request.headers['user-agent'] = self.user_agent
    self.items_scraped += 1
def wrapper(self, item: Dict, spider: Spider) -> Any:  # type: ignore
    """Gate the wrapped ``process_item`` on per-spider pipeline opt-in.

    The step runs only when this pipeline class appears in the spider's
    ``pipelines`` or ``pipelines_extra`` sets; otherwise the item is
    passed through unchanged.

    :param item: the scraped item flowing through the pipeline.
    :param spider: the spider that produced the item.
    :return: the result of the wrapped step, or *item* when skipped.
    """
    # message template for debugging
    msg = "%%s %s pipeline step" % (self.__class__.__name__, )
    # Collect the pipeline classes this spider opted into. isinstance
    # replaces the old "type(...) is set" check and set([]) literal.
    pipelines = set()
    for attr in ("pipelines", "pipelines_extra"):
        declared = getattr(spider, attr, None)
        if isinstance(declared, set):
            pipelines |= declared
    if self.__class__ in pipelines:
        spider.log(msg % "Executing", level=logging.INFO)
        return process_item_method(self, item, spider)
    # Pipeline not enabled for this spider: forward the item unchanged.
    return item