def parse_shop(self, response):
    """
    Parse and extract model data.
    :param response:
    """
    item = Item()
    item_loader = ItemLoader(item=item, selector=response)
    for field in self.model_xpath:
        item.fields[field] = Field()
        if 'model_url' in field:
            item_loader.add_value(field, response.url)
        else:
            item_loader.add_xpath(field, self.model_xpath[field])
    item = self.format_item(item_loader.load_item())
    yield item
    # Build the user review URLs
    user_url = response.xpath(self.user_url).extract()
    for uid in user_url:
        brand = uid.split('-')[0]
        model = uid.split('-')[1]
        yield Request(
            self.user_review_url.format(brand=brand, model=model),
            self.parse_comment)
def parse_forum(self, response):
    """
    Parse the comment content.
    """
    item = Item()
    itemloader = ItemLoader(item=item, selector=response)
    for field in self.forum_xpath:
        item.fields[field] = Field()
        if 'main_body' in field:
            content = re.compile(r'<html><body><.*?>(.*?)</body></html>', re.S | re.M)
            content = content.findall(response.text)
            content = re.sub(r'<script>.*?</script>', '', ''.join(content))
            content = re.sub(r'[\r\n]', '', content)
            content = re.sub(r'<div .*?>.*?</div>', '', content)
            content = re.sub(r'<style .*?>.*?</style>', '', content, flags=re.S | re.M)
            content = re.sub(r'&.*?;', '', content)
            content = re.sub(r'<.*?>', '', content, flags=re.M | re.I)
            content = re.sub(' ', '', content)
            itemloader.add_value(field, content)
        elif 'content_url' in field:
            itemloader.add_value(field, response.url)
        else:
            itemloader.add_xpath(field, self.forum_xpath[field])
    item = self.format_item(itemloader.load_item())
    yield item
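# A minimal alternative sketch (not the spider's actual code) for cleaning the
# main_body markup: w3lib, which ships with Scrapy, can replace the chain of
# regex substitutions above. The helper name `clean_main_body` is an
# illustrative assumption.
import re

from w3lib.html import remove_tags, remove_tags_with_content, replace_entities


def clean_main_body(html):
    # Drop <script>/<style> blocks with their content, strip remaining tags,
    # decode HTML entities, then remove all whitespace as the original does.
    text = remove_tags_with_content(html, which_ones=('script', 'style'))
    text = replace_entities(remove_tags(text))
    return re.sub(r'\s+', '', text)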
def test_add_stats_item_scraped_count_by_item_type(spider):
    for _ in range(15):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item={"_type": "regular_dict"},
            response="",
            spider=spider,
        )
    for _ in range(20):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item=Item(),
            response="",
            spider=spider,
        )
    for _ in range(25):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item=TestItem(),
            response="",
            spider=spider,
        )
    stats = spider.crawler.stats.get_stats()
    assert stats.get("spidermon_item_scraped_count") == 60
    assert stats.get("spidermon_item_scraped_count/dict") == 15
    assert stats.get("spidermon_item_scraped_count/Item") == 20
    assert stats.get("spidermon_item_scraped_count/TestItem") == 25
def PlayerRow(**fields):
    item = Item()
    for field in all_fields:
        item.fields[field] = Field()
    for field, value in fields.iteritems():
        item[field] = value
    return item
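# Usage sketch for the PlayerRow factory above, assuming `all_fields`
# includes 'name', 'team', and 'position'; names and values are illustrative.
row = PlayerRow(name='Jane Doe', team='Example FC')
assert row['name'] == 'Jane Doe'   # set from the keyword arguments
assert 'position' in row.fields    # declared as a Field but left unpopulated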
def parse_user(self, response):
    """
    Parse data from the returned response.
    :param response: the response returned by the Scrapy framework
    """
    result = {
        'user_name': response.meta['user_name'],
        'title': response.meta['title'],
        'date': response.meta['date'],
        'main_body': response.meta['main_body'],
        'content_url': response.meta['content_url'],
        'brand': response.meta['brand']
    }
    for content in response.xpath(self.user_list_xpath):
        item = Item()
        item_loader = ItemLoader(item=item, selector=content)
        for field in self.user_xpath:
            item.fields[field] = Field()
            if 'user_url' in field:
                item_loader.add_value(field, response.url)
            else:
                item_loader.add_xpath(field, self.user_xpath[field])
        result.update(item_loader.load_item())
        item = self.format_item(result)
        yield item
    # User comment link
    user_comment = response.xpath('.//ul/li[@class="Comments"]/a/@href').extract()
    if user_comment:
        yield Request(self.url + user_comment[0], self.parse_comment)
def parse_item(self, response):
    item = Item()
    item.fields['url'] = Field()
    item.fields['url_md5'] = Field()
    item.fields['title'] = Field()
    item.fields['pubtime'] = Field()
    item.fields['content'] = Field()
    item.fields['author'] = Field()
    item.fields['site_name'] = Field()
    l = ItemLoader(item=item, response=response)
    url = response.url
    md5 = hashlib.md5()
    l.add_value(u'url', url)
    md5.update(url)
    url_md5 = md5.hexdigest()
    l.add_value(u'url_md5', url_md5)
    l.add_xpath('title', "//h2/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('pubtime', "//p[@class='time']/span[1]/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('content', "//div[@class='content']//text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('author', "//p[@class='time']/span[last()]/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_value("site_name", u"网易号")
    yield l.load_item()
def parse(self, response):
    """
    Parse data from the returned response.
    :param response: the response returned by the Scrapy framework
    """
    keyword = response.meta['keyword']
    for complaint in response.xpath(self.list_xpath):
        item = Item()
        item_loader = ItemLoader(item=item, selector=complaint)
        for field in self.content_xpath:
            item.fields[field] = Field()
            if 'content_url' in field:
                item_loader.add_value(field, response.url)
            elif 'brand' in field:
                item_loader.add_value(field, keyword)
            else:
                item_loader.add_xpath(field, self.content_xpath[field])
        # User link
        user_id = complaint.xpath('.//span[@class="Author"]/a/@href').extract()
        for uid in user_id:
            yield Request(self.url + uid, self.parse_user,
                          meta=dict(item_loader.load_item()))
    # Next page
    next_page = response.xpath('//div[@id="PagerBefore"]/a[last()]/@href').extract()
    if next_page:
        yield Request(self.url + next_page[0], meta={'keyword': keyword},
                      callback=self.parse)
def parse(self, response):
    item = Item()
    sel = Selector(response)
    fields = json.loads(self.conf.get("fields"))
    rules = json.loads(self.conf.get("rules"))
    loops = rules.get("rules").get("rules_listxpath")
    if fields.get("fields", "") == "":
        logging.error(u"内容解析未得到!!!")
        yield item
        return
    item.fields["url"] = Field()
    item.fields["spider_jobid"] = Field()
    item["spider_jobid"] = self.spider_jobid
    item.fields['word'] = Field()
    item['word'] = response.meta.get("word")
    # Build Field objects and XPath rules from the dynamically loaded field config (method 1)
    for loop in sel.xpath("{}".format(loops)):
        item['url'] = loop.xpath(
            u"{}".format(fields.get("fields").get("url").get("xpath"))).extract()
        for k in loadMySQL(self.conf.get("spider_name")):
            if fields.get("fields").get(k[2]) != None:
                item.fields[k[2]] = Field()
                if fields.get("fields").get(k[2]).keys()[0] == "xpath":
                    item[k[2]] = loop.xpath(
                        u"{}".format(fields.get("fields").get(k[2]).get("xpath"))).extract()
                elif fields.get("fields").get(k[2]).keys()[0] == "value":
                    item[k[2]] = u"{}".format(fields.get("fields").get(k[2]).get("value"))
        yield item
class PostItem(Item):
    title = Field()
    content = Field()
    url = Field()
    user = Field()
    fromurl = Field()
    postdate = Field()
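# Usage sketch for the declared PostItem above; values are illustrative.
# Unlike the bare Item() plus item.fields[...] = Field() pattern used in the
# parse callbacks elsewhere in this file, the fields here are fixed when the
# class is defined.
post = PostItem(title='Example thread', url='http://example.com/t/1')
post['user'] = 'example_user'
post['postdate'] = '2020-01-01'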
def describe_request_result_handling():
    single_request = Request("http://test.com")
    mixed_requests = [
        Request("http://test.com"),
        FormRequest("http://test2.com")
    ]
    complete_mix = mixed_requests + [Item()]

    def it_can_extract_request_objects():
        assert requests_in_parse_result([single_request]) == [single_request]

    def it_tolerates_None():
        assert requests_in_parse_result(None) == []
        assert items_in_parse_result(None) == []
        assert count_requests_in_parse_result(None) == 0
        assert count_items_in_parse_result(None) == 0

    def it_tolerates_single_elements():
        assert requests_in_parse_result(single_request) == [single_request]
        assert items_in_parse_result(single_request) == []

    def it_tolerates_and_sorts_out_items_mixed_in_between():
        assert requests_in_parse_result(complete_mix) == mixed_requests

    def it_tolerates_different_request_types():
        assert requests_in_parse_result(mixed_requests) == mixed_requests

    def it_extracts_urls_from_requests():
        urls = urls_from_requests(complete_mix)
        assert len(urls) == 2
        assert "http://test.com" in urls
        assert "http://test2.com" in urls

    def it_counts_the_requests_and_other_results():
        assert count_requests_in_parse_result(complete_mix) == 2
        assert count_items_in_parse_result(complete_mix) == 1
def parse(self, response):
    item = Item()
    fields = json.loads(self.conf.get("fields"))
    l = ItemLoader(item, response)
    item.fields["url"] = Field()
    item.fields["spider_jobid"] = Field()
    l.add_value("url", response.url)
    l.add_value("spider_jobid", self.spider_jobid)
    # Build Field objects and XPath rules from the dynamically loaded field config (method 1)
    for k in self.keys:
        if fields.get("fields", "") == "":
            logging.error(u"内容解析未得到!!!")
            return l.load_item()
        if fields.get("fields").get(k) != None:
            item.fields[k] = Field()
            if fields.get("fields").get(k).keys()[0] == "xpath":
                l.add_xpath(
                    k,
                    u"{}".format(fields.get("fields").get(k).get("xpath")),
                    MapCompose(unicode.strip))
            elif fields.get("fields").get(k).keys()[0] == "value":
                if fields.get("fields").get(k).get("value") == u"{TODAY}":
                    l.add_value(k, u"{}".format(datetime.now()))
                else:
                    l.add_value(
                        k, u"{}".format(fields.get("fields").get(k).get("value")))
    return l.load_item()
def parse_item(self, response):
    item = Item()
    word = response.meta['word']
    fields = json.loads(self.conf.get("fields"))
    l = ItemLoader(item, response)
    if fields.get("fields", "") == "":
        logging.error(u"内容解析未得到!!!")
        return l.load_item()
    item.fields["url"] = Field()
    item.fields["spider_jobid"] = Field()
    l.add_value("url", response.url)
    l.add_value("spider_jobid", self.spider_jobid)
    item.fields['word'] = Field()
    l.add_value('word', word)
    # Build Field objects and XPath rules from the dynamically loaded field config (method 1)
    for k in loadMySQL(self.name_spider)['fields'].keys():
        if fields.get("fields", "") == "":
            logging.error(u"内容解析未得到!!!")
            return l.load_item()
        if fields.get("fields").get(k) != None:
            item.fields[k] = Field()
            if fields.get("fields").get(k).keys()[0] == "xpath":
                l.add_xpath(
                    k,
                    u"{}".format(fields.get("fields").get(k).get("xpath")),
                    MapCompose(unicode.strip))
            elif fields.get("fields").get(k).keys()[0] == "value":
                l.add_value(
                    k, u"{}".format(fields.get("fields").get(k).get("value")))
    return l.load_item()
def parse(self, response):
    """
    Parse the page and extract comments.
    :param response: the response content
    """
    search = response.meta['keyword']
    # Comment listing page
    for complaint in response.xpath(self.list_xpath):
        item = Item()
        item_loader = ItemLoader(item=item, selector=complaint)
        for field in self.content_xpath:
            item.fields[field] = Field()
            if 'content_url' in field:
                item_loader.add_value(field, response.url)
            elif 'brand' in field:
                item_loader.add_value(field, search)
            else:
                item_loader.add_xpath(field, self.content_xpath[field])
        # Build the user_url link
        uid = complaint.xpath('.//tr/td[@class="small"]/a/@href').extract()
        yield Request(self.urls + uid[0], self.parse_user,
                      meta=dict(item_loader.load_item()), dont_filter=True)
    # Next page of content
    next_page = response.xpath(
        '//div[@class="pagelinks"]/a[last()]/@href').extract()
    for page in next_page:
        yield Request(self.urls + page, self.parse, meta={'keyword': search})
def parse_item(self, response):
    item = Item()
    item.fields['url'] = Field()
    item.fields['url_md5'] = Field()
    item.fields['title'] = Field()
    item.fields['pubtime'] = Field()
    item.fields['content'] = Field()
    item.fields['author'] = Field()
    item.fields['site_name'] = Field()
    l = ItemLoader(item=item, response=response)
    url = response.url
    url = url.replace("http://rym.quwenge.com/baidu_tiaozhuan.php?url=", "")
    md5 = hashlib.md5()
    l.add_value(u'url', url)
    md5.update(url)
    url_md5 = md5.hexdigest()
    l.add_value(u'url_md5', url_md5)
    l.add_xpath('title', "//h1/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('pubtime', "//span[@class='read']/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('content', "//div[@id='content']//text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('author', "//div[@class='name']/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_value("site_name", u"百家号")
    yield l.load_item()
def parse_item(self, response):
    item = Item()
    item.fields['url'] = Field()
    item.fields['url_md5'] = Field()
    item.fields['title'] = Field()
    item.fields['pubtime'] = Field()
    item.fields['content'] = Field()
    item.fields['author'] = Field()
    item.fields['author_url'] = Field()
    item.fields['site_name'] = Field()
    l = ItemLoader(item=item, response=response)
    url = response.url
    md5 = hashlib.md5()
    l.add_value(u'url', url)
    md5.update(url)
    url_md5 = md5.hexdigest()
    l.add_value(u'url_md5', url_md5)
    l.add_xpath('title', "//h1/text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath(
        'pubtime',
        "//p[@class='clearfix']//span/text() | //div[@class='titleLine-gY7DniPB']//span/text()",
        MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('content', "//div[@class='text-3zQ3cZD4']//text()",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath(
        'author',
        "//p[@class='clearfix']/a[2]/text() | //div[@class='titleLine-gY7DniPB']/p/a[2]/text()",
        MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_xpath('author_url', "//div[@class='titleLine-gY7DniPB']/p/a[2]/@href",
                MapCompose(unicode.lstrip, unicode.rstrip))
    l.add_value("site_name", u"大风号")
    yield l.load_item()
def test_mpm_middleware(self):
    # create fake response
    a = MagicMock()
    a.meta = {'key1': 'value1', 'key2': 'value2'}
    yield_count = 0
    # test all types of results from a spider:
    # dicts, items, or requests
    test_list = [{}, Item(), Request('http://istresearch.com')]
    for item in self.mpm.process_spider_output(a, test_list, MagicMock()):
        if isinstance(item, Request):
            self.assertEquals(a.meta, item.meta)
        yield_count += 1
    self.assertEquals(yield_count, 3)
    # 1 debug for the method, 1 debug for the request
    self.assertEquals(self.mpm.logger.debug.call_count, 2)

    # test meta unchanged if already exists
    r = Request('http://aol.com')
    r.meta['key1'] = 'othervalue'
    for item in self.mpm.process_spider_output(a, [r], MagicMock()):
        # key1 value1 did not pass through, since it was already set
        self.assertEquals(item.meta['key1'], 'othervalue')
        # key2 was not set, therefore it passed through
        self.assertEquals(item.meta['key2'], 'value2')
def test_process_not_trackable(self):
    pipeline = ItemTrackerPipeline.from_crawler(self.crawler)
    pipeline.storage = mock.Mock()
    expected = Item()
    found = pipeline.process_item(expected, self.spider)
    self.assertEqual(expected, found)
    pipeline.storage.assert_not_called()
def second_parse(self, response):
    href_list = response.xpath('//ul[@class="wp100"]/li//div/p[@class="fs22"]/a/@href')
    print(len(href_list), response.url)
    for href in href_list:
        item = Item()
        next_url = 'http://www.sinyi.com.cn/' + href.extract().split('/', 1)[1].split('&cookieuid=')[0]
        item.fields['HouseUrl'] = Field()
        item['HouseUrl'] = next_url
        yield scrapy.Request(next_url, callback=self.third_parse, meta={'item': item})
def housing_handle(self, response):
    item = response.meta.get("item")
    item1 = Item()
    PropertyAddress = response.xpath(
        "//div[@class='detailDesc']/text()").extract_first()
    PriceUnit = response.xpath(
        "//span[@class='xiaoquUnitPrice']/text()").extract_first()
    detail_community = response.xpath("//div[@class='xiaoquInfo']")
    BuildedTime = detail_community.xpath(
        "./div[@class='xiaoquInfoItem'][1]/span[@class='xiaoquInfoContent']/text()"
    ).extract_first()
    BuildingType = detail_community.xpath(
        "./div[@class='xiaoquInfoItem'][2]/span[@class='xiaoquInfoContent']/text()"
    ).extract_first()
    PropertyFee = detail_community.xpath(
        "./div[@class='xiaoquInfoItem'][3]/span[@class='xiaoquInfoContent']/text()"
    ).extract_first()
    PropertyCompany = detail_community.xpath(
        "./div[@class='xiaoquInfoItem'][4]/span[@class='xiaoquInfoContent']/text()"
    ).extract_first()
    Developers = detail_community.xpath(
        "./div[@class='xiaoquInfoItem'][5]/span[@class='xiaoquInfoContent']/text()"
    ).extract_first()
    TotalBuilding = detail_community.xpath(
        "./div[@class='xiaoquInfoItem'][6]/span[@class='xiaoquInfoContent']/text()"
    ).extract_first()
    TotalHouseholds = detail_community.xpath(
        "./div[@class='xiaoquInfoItem'][7]/span[@class='xiaoquInfoContent']/text()"
    ).extract_first()
    NearbyStores = detail_community.xpath(
        "./div[@class='xiaoquInfoItem'][8]/span[@class='xiaoquInfoContent']"
    ).xpath("string(.)").extract_first()
    item1.fields["PropertyAddress"] = Field()
    item1["PropertyAddress"] = PropertyAddress
    item1.fields["PriceUnit"] = Field()
    item1["PriceUnit"] = PriceUnit
    item1.fields["BuildedTime"] = Field()
    item1["BuildedTime"] = BuildedTime
    item1.fields["BuildingType"] = Field()
    item1["BuildingType"] = BuildingType
    item1.fields["PropertyFee"] = Field()
    item1["PropertyFee"] = PropertyFee
    item1.fields["PropertyCompany"] = Field()
    item1["PropertyCompany"] = PropertyCompany
    item1.fields["Developers"] = Field()
    item1["Developers"] = Developers
    item1.fields["TotalBuilding"] = Field()
    item1["TotalBuilding"] = TotalBuilding
    item1.fields["TotalHouseholds"] = Field()
    item1["TotalHouseholds"] = TotalHouseholds
    item1.fields["NearbyStores"] = Field()
    item1["NearbyStores"] = NearbyStores
    item1.update(item)
    yield item1
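# A minimal refactoring sketch (hypothetical helper, not part of the spider):
# the repeated "declare Field, then assign" pairs used in housing_handle and
# similar callbacks can be collapsed into one loop over a name-to-value mapping.
def fill_dynamic_item(item, values):
    # Declare each field on the bare Item and assign its extracted value.
    for name, value in values.items():
        item.fields[name] = Field()
        item[name] = value
    return item

# e.g. fill_dynamic_item(item1, {"PropertyAddress": PropertyAddress,
#                                "PriceUnit": PriceUnit})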
def setUp(self):
    self.url = 'http://localhost'
    self.kwargs = {'url': self.url, 'dont_filter': True}
    self.crawler = MagicMock()
    self.spider = MetaSpider.from_crawler(self.crawler)
    self.crawler.spider = self.spider
    self.crawl_manager = self.create_crawl_manager()
    self.crawl_manager.crawler = self.crawler
    self.item = Item()
    self.response = Response('http://localhost')
    self.another_spider = MetaSpider.from_crawler(self.crawler)
def describe_item_result_handling():
    single_item = Item()
    mixed_items = [
        Item(),
        MyItem()
    ]
    complete_mix = mixed_items + [Request("http://test.com")]

    def it_can_extract_item_objects():
        assert items_in_parse_result([single_item]) == [single_item]

    def it_tolerates_single_elements():
        assert items_in_parse_result(single_item) == [single_item]
        assert requests_in_parse_result(single_item) == []

    def it_tolerates_items_mixed_in_between():
        assert items_in_parse_result(complete_mix) == mixed_items

    def it_tolerates_different_item_types():
        assert items_in_parse_result(mixed_items) == mixed_items

    def it_counts_the_items_and_other_results():
        assert count_requests_in_parse_result(complete_mix) == 1
        assert count_items_in_parse_result(complete_mix) == 2
def handle_1(self, response):
    item1 = response.meta.get("item1")
    req = response.meta.get("req")
    fragments = re.findall(r"url \+= '(.*?)'", response.text, re.S)
    detail_url = ''
    for j in fragments:
        detail_url += j
    item2 = Item()
    item2.fields["NewUrl"] = Field()
    item2["NewUrl"] = detail_url
    item2.update(item1)
    yield scrapy.Request(url=detail_url, callback=self.parse,
                         meta={"req": req, "item2": item2, "last_page": True})
def handle_1(self, response):
    item = response.meta.get("item")
    item1 = Item()
    PropertyAddress = response.xpath(
        "//div[contains(@class,'rent-top')]/a/text()").extract_first()
    PriceUnit = response.xpath(
        "//div[contains(@class,'junjia')]/span/text()").extract_first()
    ls_detail = response.xpath(
        "//div[@class='xqfangs detail_bor_bottom']/ul[@class='clear']/li/text()"
    ).extract()
    BuildedTime = None
    BuildingType = None
    for detail in ls_detail:
        if "年" in detail:
            BuildedTime = detail
        else:
            BuildingType = detail
    PropertyCompany = response.xpath(
        "//ul/li[@class='wuyes']/em/text()").extract_first()
    Developers = response.xpath(
        "//ul/li[@class='kaifas']/em/text()").extract_first()
    TotalBuilding = response.xpath(
        "//div[@class='xqsaleinfo']/ul/li[1]/span/text()").extract_first()
    TotalHouseholds = response.xpath(
        "//div[@class='xqsaleinfo']/ul/li[2]/span/text()").extract_first()
    NearbyStores = response.xpath(
        "//div[@class='xqsaleinfo']/ul/li[6]/span/text()").extract_first()
    AroundTraffic = response.xpath(
        "//div[@class='xqsaleinfo']/ul/li[5]/span/text()").extract_first()
    item1.fields["PropertyAddress"] = Field()
    item1["PropertyAddress"] = PropertyAddress
    item1.fields["PriceUnit"] = Field()
    item1["PriceUnit"] = PriceUnit
    item1.fields["BuildedTime"] = Field()
    item1["BuildedTime"] = BuildedTime
    item1.fields["BuildingType"] = Field()
    item1["BuildingType"] = BuildingType
    item1.fields["PropertyCompany"] = Field()
    item1["PropertyCompany"] = PropertyCompany
    item1.fields["Developers"] = Field()
    item1["Developers"] = Developers
    item1.fields["TotalBuilding"] = Field()
    item1["TotalBuilding"] = TotalBuilding
    item1.fields["TotalHouseholds"] = Field()
    item1["TotalHouseholds"] = TotalHouseholds
    item1.fields["NearbyStores"] = Field()
    item1["NearbyStores"] = NearbyStores
    item1.fields["AroundTraffic"] = Field()
    item1["AroundTraffic"] = AroundTraffic
    item1.update(item)
    yield item1
def start_requests(self):
    for kw in self.kw_list:
        start_page = 1
        for page in range(start_page, 11):
            item = Item()
            item.fields["SearchWord"] = Field()
            item.fields["Page"] = Field()
            item["SearchWord"] = kw
            item["Page"] = page
            start_url = 'https://weixin.sogou.com/weixin?query={}&type=2&page={}&ie=utf8'.format(
                parse.quote(kw), str(page))
            yield scrapy.Request(url=start_url, callback=self.parse,
                                 meta={"start_url": start_url, "item": item})
def page_handle(self, response):
    region = response.meta.get("region")
    plate = response.meta.get("plate")
    plate_url = response.meta.get("plate_url")
    try:
        housing_num_flag = response.xpath(
            "//h2[contains(@class,'total')]/span/text()").extract_first().strip()
    except Exception:
        housing_num_flag = '0'
    if housing_num_flag != "0":
        housing_list = response.xpath(
            "//div[@class='leftContent']/ul[@class='sellListContent']/li[contains(@class, 'clear')]/div[@class='info clear']"
        )
        for housing in housing_list:
            # Collect the fields available on the listing page
            item = Item()
            item.fields["AreaName"] = Field()
            item["AreaName"] = region
            item.fields["PlateName"] = Field()
            item["PlateName"] = plate
            housing_url = housing.xpath(
                "./div[@class='title']/a/@href").extract_first()
            HouseDesc = housing.xpath(
                "./div[@class='title']/a/text()").extract_first()
            item.fields["HouseDesc"] = Field()
            item["HouseDesc"] = HouseDesc
            item.fields["HouseUrl"] = Field()
            item["HouseUrl"] = housing_url
            yield scrapy.Request(url=housing_url,
                                 callback=self.housing_handle,
                                 meta={"item": deepcopy(item)},
                                 headers=self.get_headers())
        page_dict_handle = response.xpath(
            "//div[contains(@class, 'page-box')]/@page-data").extract_first()
        page_dict = json.loads(page_dict_handle)
        total_page = page_dict.get("totalPage")
        current_page = page_dict.get("curPage")
        if current_page < total_page:
            next_page = plate_url + "pg" + str(current_page + 1) + "/"
            yield scrapy.Request(
                url=next_page,
                callback=self.page_handle,
                meta={
                    "plate_url": plate_url,
                    'region': region,
                    "plate": plate
                },
                headers=self.get_headers(),
            )
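# Illustrative shape of the @page-data attribute consumed in page_handle; the
# keys are inferred from the accesses above and the sample values are
# assumptions, not data from the target site.
import json

sample_page_data = '{"totalPage": 100, "curPage": 1}'
page_dict = json.loads(sample_page_data)
assert page_dict.get("totalPage") == 100 and page_dict.get("curPage") == 1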
def parse(self, response):
    req = response.meta.get("req")
    item = response.meta.get("item")
    url_list_handle = Selector(text=response.text)
    url_list = url_list_handle.xpath("//div[@class='txt-box']/h3/a/@href").extract()
    for index, url in enumerate(url_list):
        item1 = Item()
        item1.fields["Located"] = Field()
        item1["Located"] = index
        url = self.base_url + url
        url = self.get_real_url_handle(url)
        item1.update(item)
        yield scrapy.Request(url=url, callback=self.handle_1,
                             meta={"req": req, "item1": item1})
def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
    assert hs_spider_middleware._seen_requests == WeakKeyDictionary()
    assert hs_downloader_middleware._seen_requests == WeakKeyDictionary()
    assert hs_spider_middleware._seen_requests is hs_downloader_middleware._seen_requests

    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)

    hs_downloader_middleware.process_request(request_0, spider)
    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0

    hs_downloader_middleware.process_response(request_0, response_0, spider)
    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0

    response_0.request = request_0
    request_1 = Request(url)
    request_2 = Request(url)
    item1 = {}
    item2 = Item()
    output = [request_1, request_2, item1, item2]
    processed_output = list(
        hs_spider_middleware.process_spider_output(response_0, output, spider))
    assert processed_output[0] is request_1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[1] is request_2
    assert request_2.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[2] is item1
    assert processed_output[3] is item2

    response_1 = Response(url)
    hs_downloader_middleware.process_request(request_1, spider)
    hs_downloader_middleware.process_response(request_1, response_1, spider)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

    response_2 = Response(url)
    hs_downloader_middleware.process_request(request_2, spider)
    hs_downloader_middleware.process_response(request_2, response_2, spider)
    assert request_2.meta[HS_REQUEST_ID_KEY] == 2
    assert request_2.meta[HS_PARENT_ID_KEY] == 0
def start_requests(self):
    item = Item()
    for url in self.start_urls:
        url_slit = url.split(":")
        site_name = url_slit[0]
        url = ":".join(url_slit[1:-1])
        item.fields["site_name"] = Field()
        item['site_name'] = site_name
        item.fields["source_url"] = Field()
        item['source_url'] = url_slit[-1]
        yield scrapy.Request(url=url, meta={"item": item},
                             dont_filter=True, callback=self.parse_url)
def handle_2(self, response):
    item2 = response.meta.get("item2")
    item3 = Item()
    item3.fields["Title"] = Field()
    item3.fields["Content"] = Field()
    res_text = response.text
    res3_handle = Selector(text=res_text)
    # title
    title = res3_handle.xpath('//meta[@property="og:title"]/@content').extract_first()
    try:
        content = res3_handle.xpath("//div[@id='js_content']").xpath("string(.)").extract_first()
        content = content.strip()
    except Exception as e:
        content = None
    item3["Title"] = title
    item3["Content"] = content
    item3.update(item2)
    yield item3
def second_parse(self, response):
    obj = json.loads(response.text)
    data = self.get_home_data(obj)
    for index, url in enumerate(data['house_url']):
        # TODO: add a check for duplicate pages
        item = Item()
        item.fields["AreaName"] = Field()
        item.fields["PlateName"] = Field()
        item.fields["HouseUrl"] = Field()
        item.fields["BuildedTime"] = Field()
        item.fields["TimeToRelease"] = Field()
        item.fields["PropertyAddress"] = Field()
        # item.fields["Floor"] = Field()
        item.fields["TotalPrice"] = Field()
        item.fields["BuildingSquare"] = Field()
        item.fields["PropertyCommunity"] = Field()
        item.fields["HasElevator"] = Field()
        item['AreaName'] = data['area'][index]
        item['PlateName'] = data['road'][index]
        flag = data['house_url']
        if flag:
            house_url = flag[index]
            if house_url:
                item['HouseUrl'] = self.base_url + house_url
        item['BuildedTime'] = data['build_time'][index]
        item['TimeToRelease'] = time.strftime(
            "%Y-%m-%d",
            time.localtime(int(str(data['release_time'][index])[:-3])))
        item['PropertyAddress'] = data['addr'][index]
        # item['Floor'] = data['floor'][index]
        item['TotalPrice'] = data['total_price'][index]
        item['BuildingSquare'] = data['build_size'][index]
        item['PropertyCommunity'] = data['community'][index]
        item['HasElevator'] = data['elevator'][index]
        try:
            next_url = "https://www.dafangya.com" + url
        except Exception:
            continue
        yield scrapy.Request(url=next_url, callback=self.third_parse,
                             meta={'item': item}, dont_filter=True)