def parse(self, response):
    """Parse one complaint list page: yield a detail-page Request per entry,
    then follow the pagination arrow.

    Bug fix: the next-page href must be tested for None BEFORE prefixing the
    host — the original concatenated first, which raises TypeError on the
    last page and made the `is not None` check unreachable.
    """
    print(self.hello)
    print('--' * 1000)
    li_list = response.xpath("//ul[@class='title-state-ul']/li")
    for li in li_list:
        item = YangguangItem()
        item['id'] = li.xpath(
            "./span[@class='state1']/text()").extract_first()
        item['title'] = li.xpath(
            "./span[@class='state3']/a/text()").extract_first()
        item['href'] = li.xpath(
            "./span[@class='state3']/a/@href").extract_first()
        # Detail links are site-relative; make them absolute.
        item['href'] = 'http://wzzdg.sun0769.com/' + item['href']
        item['public_date'] = li.xpath(
            "./span[@class='state5']/text()").extract_first()
        item['sleep_date'] = li.xpath(
            "./span[@class='state4']/text()").extract_first()
        # meta carries the partially-filled item into parse_detail.
        yield scrapy.Request(item['href'],
                             callback=self.parse_detail,
                             meta={'item': item})
    # 翻页 — pagination: only build the absolute URL once a link exists.
    next_url = response.xpath(
        "//a[@class='arrow-page prov_rota']/@href").get()
    if next_url is not None:
        yield scrapy.Request('http://wzzdg.sun0769.com' + next_url,
                             callback=self.parse)
def parse(self, response):
    """Parse one grouped complaint table page.

    Bug fix: the original created a single YangguangItem BEFORE the loop and
    mutated it on every row; because the yielded Requests only hold a
    reference in meta, every detail callback saw the last row's data. A
    fresh item is now created per row.
    """
    # self.settings["MONGO_HOST"]
    # self.settings.get("MONGO_HOST","")
    print(self.hello, "*" * 100)
    # Group the rows of the listing table.
    tr_list = response.xpath(
        "//div[@class='greyframe']//table[2]//tr/td/table//tr")
    for tr in tr_list:
        item = YangguangItem()  # one item per row (see docstring)
        item['id'] = tr.xpath("./td[1]/text()").extract_first()
        item['title'] = tr.xpath("./td[2]/a[2]/text()").extract_first()
        item['content_url'] = tr.xpath(
            "./td[2]/a[2]/@href").extract_first()
        item['city'] = tr.xpath("./td[2]/a[3]/text()").extract_first()
        item['state'] = tr.xpath("./td[3]/span/text()").extract_first()
        item['complainant'] = tr.xpath("./td[4]/text()").extract_first()
        item['time'] = tr.xpath("./td[5]/text()").extract_first()
        yield scrapy.Request(item['content_url'],
                             callback=self.parse_detail,
                             meta={"item": item})
    next_url = response.xpath("//a[text()='>']/@href").extract_first()
    # PEP 8: compare to None with `is not`, not `!=`.
    if next_url is not None:
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Walk the complaint list, emit one detail request per entry, then
    follow the pagination arrow relative to ``self.base_url``."""
    rows = response.xpath(
        '//div[@class="width-12"]//ul[@class="title-state-ul"]/li')
    # (item field, xpath relative to the row) for the plain extracted values.
    field_paths = (
        ("title_NO", './span[@class="state1"]/text()'),
        ("status", './span[@class="state2"]/text()'),
        ("title", './span[@class="state3"]/a/text()'),
        ("title_url", './span[@class="state3"]/a/@href'),
        ("askTime", './span[@class="state4"]/text()'),
        ("answerTime", './span[@class="state5 "]/text()'),
    )
    for row in rows:
        item = YangguangItem()
        for field, path in field_paths:
            item[field] = row.xpath(path).extract_first()
        # Detail links are relative; prefix the site base.
        item["title_url"] = self.base_url + item["title_url"]
        logger.warning(item)
        yield scrapy.Request(item["title_url"],
                             callback=self.parse_detail,
                             meta={"item": item})
    next_page = response.xpath(
        '//div[@class="mr-three paging-box"]//a[@class="arrow-page prov_rota"]/@href'
    ).extract_first()
    if next_page is not None:
        yield scrapy.Request(self.base_url + next_page, callback=self.parse)
def parse(self, response):
    """Parse one listing page of the morelist board and paginate via the
    '>' anchor, logging page progress as it goes."""
    rows = response.xpath('//div[@id="morelist"]//table//table//tr')
    for row in rows:
        item = YangguangItem()
        item['href'] = row.xpath('.//td[2]/a[2]/@href').extract_first()
        item['title'] = row.xpath(
            './/a[@class="news14"]/text()').extract_first()
        item['author'] = row.xpath('.//td[last()-1]/text()').extract_first()
        item['status'] = row.xpath(
            './/td[last()-2]/span/text()').extract_first()
        item['publish_time'] = row.xpath(
            './/td[last()]/text()').extract_first()
        # Hand the detail page to parse_detail with the item in meta.
        yield scrapy.Request(item['href'],
                             meta={"item": item},
                             callback=self.parse_detail)
    # The current page number is only used for the progress message below.
    page = response.xpath(
        '//div[@class="pagination"]/span/text()').extract_first()
    next_page = response.xpath('//a[text()=">"]/@href').extract_first()
    if next_page is not None:
        print('第{}页成功'.format(page))
        yield scrapy.Request(next_page, callback=self.parse)
def parse_item(self, response):
    """Extract title, number, content and url from one complaint detail page."""
    item = YangguangItem()
    # The two header spans read "提问:<title>" and "编号:<number>"; the regex
    # groups strip the labels.
    header_text = response.css("div.wzy1 span.niae2_top::text").extract_first()
    item['title'] = re.search(r"提问:(.*)", header_text).group(1)
    number_text = response.css(
        "div.wzy1 span.niae2_top+span::text").extract_first()
    item['number'] = re.search(r"编号:(.*)", number_text).group(1)
    # The body cell is padded with non-breaking spaces; trim them off.
    body_text = response.css(
        "div.wzy1 table:nth-child(2) tr:first-child td::text").extract_first()
    item['content'] = body_text.strip('\xa0')
    item['url'] = response.url
    yield item
def parse(self, response):
    """Parse the complaint list table and follow each row's detail link.

    Bug fixes: the original xpaths were missing the closing quote on the
    class predicate (``a[@class='news14]``), and the Request was written
    with brace syntax, a missing comma, and ``callback=self.parse_detail(meta)``
    (calling the callback instead of passing it) — none of which is valid
    Python.
    """
    tr_list = response.xpath(
        "//div[@class='pagecenter']/table[2]/tr/td/table/tr")
    for tr in tr_list:
        item = YangguangItem()
        item["title"] = tr.xpath(
            "./td[2]/a[@class='news14']/@title").extract_first()
        item["href"] = tr.xpath(
            "./td[2]/a[@class='news14']/@href").extract_first()
        item["publish_date"] = tr.xpath(
            "./td[last()]/text()").extract_first()
        # Pass the callback as a callable and the item via the meta dict.
        yield scrapy.Request(
            item["href"],
            callback=self.parse_detail,
            meta={"item": item},
        )
def parse(self, response):
    """Turn each row of the newsHead table into a detail-page request."""
    rows = response.xpath(
        "//div[@class='newsHead clearfix']/table[2]/tr")
    for row in rows:
        item = YangguangItem()
        item['href'] = row.xpath("./td[3]/a[1]/@href").extract_first()
        item['title'] = row.xpath("./td[3]/a[1]/text()").extract_first()
        item['publish_data'] = row.xpath("./td[6]/text()").extract_first()
        # parse_detail fills in the rest of the item from the detail page.
        yield scrapy.Request(item['href'],
                             meta={'item': item},
                             callback=self.parse_detail)
def parse(self, response):
    """Parse the grey-frame complaint table; one detail Request per row."""
    rows = response.xpath(
        "//div[@class='greyframe']/table[2]/tr/td/table/tr")
    # (item field, xpath relative to the row) pairs for each column.
    columns = (
        ("num", "./td[1]/text()"),
        ("title", "./td[2]/a[2]/@title"),
        ("href", "./td[2]/a[2]/@href"),
        ("stats", "./td[3]/span/text()"),
        ("author_name", "./td[4]/text()"),
    )
    for row in rows:
        item = YangguangItem()  # fresh item instance per row
        for field, path in columns:
            item[field] = row.xpath(path).extract_first()
        # Request the detail page; meta (a dict) carries the item through
        # to the parse_detail callback.
        yield scrapy.Request(
            item["href"],
            callback=self.parse_detail,
            meta={"item": item},
        )
def parse(self, response):
    """Extract and print each complaint row from the current list page.

    Bug fix: ``item['href']`` originally read ``a/text()`` — the anchor's
    text — so it duplicated ``title`` instead of holding the link. It now
    reads the ``@href`` attribute.
    """
    # Group the rows, then extract per row.
    tr_list = response.xpath(
        "//div[@class='greyframe']/table[2]/tr/td/table/tr")
    print(tr_list)
    for tr in tr_list:
        item = YangguangItem()
        item['num'] = tr.xpath("./td[1]/text()").extract_first()
        item['title'] = tr.xpath("./td[2]/a/text()").extract_first()
        item['href'] = tr.xpath("./td[2]/a/@href").extract_first()
        item['status'] = tr.xpath("./td[3]/span/text()").extract_first()
        item['name'] = tr.xpath("./td[4]/text()").extract_first()
        item['publish_data'] = tr.xpath("./td[5]/text()").extract_first()
        print(item)
def parse(self, response):
    """Parse one list page, requesting each detail page and the next list
    page via the '>' pagination anchor."""
    rows = response.xpath(
        '//div[@class="greyframe"]/table[2]/tr/td/table/tr')
    # Column map: item field -> xpath relative to the row.
    columns = (
        ("ID", './td[1]/text()'),
        ("title", './td[2]/a[2]/@title'),
        ("href", './td[2]/a[2]/@href'),
        ("status", './td[3]/span/text()'),
        ("user_name", './td[4]/text()'),
        ("publish_time", './td[5]/text()'),
    )
    for row in rows:
        item = YangguangItem()
        for field, path in columns:
            item[field] = row.xpath(path).extract_first()
        yield scrapy.Request(item["href"],
                             callback=self.parse_detail,
                             meta={"item": item})
    # Follow ">" until the last page (no link -> stop).
    next_page = response.xpath(
        '//div[@class="pagination"]/a[text()=">"]/@href').extract_first()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """List-page parser; deep-copies the item into meta so concurrently
    scheduled detail requests do not share one mutable object."""
    rows = response.xpath(
        "//div[@class='greyframe']/table[2]/tr/td/table/tr")
    print(len(rows))
    for row in rows:
        item = YangguangItem()
        item["title"] = row.xpath(
            "./td[2]/a[@class='news14']/@title").get()
        item["href"] = row.xpath(
            "./td[2]/a[@class='news14']/@href").get()
        item["publish_date"] = row.xpath("./td[last()]/text()").get()
        # Snapshot the item for this request (see docstring).
        yield scrapy.Request(item['href'],
                             meta={"item": deepcopy(item)},
                             callback=self.parse_detail)
    # Pagination via the ">" anchor.
    next_page = response.xpath("//a[text()='>']/@href").get()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Parse newsHead table rows into items and paginate via the '>' link."""
    rows = response.xpath(
        "//div[@class='newsHead clearfix']/table[2]/tr")
    for row in rows:
        item = YangguangItem()
        item['href'] = row.xpath('./td[3]/a[1]/@href').extract_first()
        item['title'] = row.xpath("./td[3]/a[1]/@title").extract_first()
        item['update_time'] = row.xpath(
            './td[last()]/text()').extract_first()
        # parse_detail handles the detail page; meta forwards the item.
        yield scrapy.Request(item['href'],
                             meta={"item": item},
                             callback=self.parse_detail)
    # Build the next list-page request, if a ">" link exists.
    next_page = response.xpath("//a[text()='>']/@href").extract_first()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Row-wise list parser; every row becomes a detail-page request,
    and the '>' anchor drives pagination."""
    rows = response.xpath(
        "//div[@class='greyframe']/table[2]/tr/td/table/tr")
    # Item field -> xpath relative to the current row.
    columns = (
        ('title', "./td[2]/a[@class='new14']/@title"),
        ('href', "./td[2]/a[@class='new14']/@href"),
        ('publish_date', "./td[last()]/text()"),
    )
    for row in rows:
        item = YangguangItem()
        for field, path in columns:
            item[field] = row.xpath(path).extract_first()
        yield scrapy.Request(
            item["href"],
            callback=self.parse_detail,
            meta={'item': item},
        )
    next_page = response.xpath("//a[text()='>']/@href").extract_first()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """CSS-selector list-page parser: one detail request per entry, then
    the next page.

    Bug fix: the original built ``next_url`` by concatenating the host with
    ``extract_first()`` BEFORE checking for None — on the last page that
    raises TypeError, and the subsequent ``is not None`` check could never
    fire. The raw href is now checked first.
    """
    # self.hello is assigned externally (pipeline open_spider / crawler setup)
    # and is available here; kept as in the original comment.
    li_list = response.css(
        'body > div.public-content > div.width-12 > ul.title-state-ul > li')
    for li in li_list:
        item = YangguangItem()
        item['id'] = li.css('.state1::text').extract_first()
        item['title'] = li.css('.state3 .color-hover::text').extract_first()
        item["href"] = 'http://wz.sun0769.com' + li.css(
            '.state3 .color-hover::attr(href)').extract_first()
        item['create_time'] = li.css('.state5::text').extract_first()
        # Follow the detail link; meta carries the item to parse_detail.
        yield scrapy.Request(url=item["href"],
                             callback=self.parse_detail,
                             meta={"item": item})
    # Pagination: validate the raw href before building the absolute URL.
    next_href = response.css('.prov_rota::attr(href)').extract_first()
    if next_href is not None:
        next_url = 'http://wz.sun0769.com' + next_href
        print('下一页:' + next_url)
        yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Parse complaint rows and follow pagination.

    Bug fix: the original ended with ``if not next_url: yield Request(...)``
    — inverted logic that only issued the request when the link was MISSING
    (requesting None) and never paginated otherwise. The condition is now
    ``is not None``.
    """
    tr_list = response.xpath(
        "//div[@class='greyframe']/table[2]/tr/td/table/tr")
    for tr in tr_list:
        item = YangguangItem()
        item["number"] = tr.xpath("./td[1]/text()").extract_first()
        item["title"] = tr.xpath(
            "./td[2]/a[@class='news14']/text()").extract_first()
        item["href"] = tr.xpath(
            "./td[2]/a[@class='news14']/@href").extract_first()
        item["name"] = tr.xpath("./td[4]/text()").extract_first()
        item["public_time"] = tr.xpath("./td[5]/text()").extract_first()
        # Detail pages are handled by parse_content; meta forwards the item.
        yield scrapy.Request(item["href"],
                             callback=self.parse_content,
                             meta={"item": item})
    next_url = response.xpath("//a[text()='>']/@href").extract_first()
    if next_url is not None:
        yield scrapy.Request(next_url, callback=self.parse)
def parse_item(self, response):
    """Extract a single complaint's title, number, content and url.

    Cleanup: the original ``if not content: ... else: ...`` assigned the
    same value in both branches — the only live logic was the fallback
    selector, which is preserved here without the dead branch.
    """
    item = YangguangItem()
    # Title (the page heading).
    item['title'] = response.xpath(
        '//div[contains(@class, "pagecenter p3")]//strong/text()').extract_first()
    # Number: the trailing "编号:NNN" fragment of the title.
    item['number'] = item['title'].split(' ')[-1].split(":")[-1]
    # Content: prefer the with-image container, fall back to text-only.
    content = response.xpath(
        '//div[@class="contentext"]/text()').extract_first()
    if not content:
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract_first()
    item['content'] = content
    # Source link of this complaint.
    item['url'] = response.url
    yield item
def parse(self, response):
    """Parse one grey-frame list page: a detail request per row, then the
    next page via the '>' anchor."""
    rows = response.xpath(
        "//div[@class='greyframe']/table[2]/tr/td/table/tr")
    # Item field -> xpath relative to the row.
    columns = (
        ('num', "./td[1]/text()"),
        ('title', "./td[2]/a[2]/text()"),
        ('href', "./td[2]/a[2]/@href"),
        ('status', "./td[3]/span/text()"),
        ('author', "./td[4]/text()"),
        ('publish_date', "./td[5]/text()"),
    )
    for row in rows:
        item = YangguangItem()
        for field, path in columns:
            item[field] = row.xpath(path).extract_first()
        yield scrapy.Request(
            item['href'],
            callback=self.parse_detail,
            meta={'item': item},
        )
    next_page = response.xpath("//a[text()='>']/@href").extract_first()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Group complaint table rows and request each row's detail page, then
    follow the '>' pagination anchor."""
    rows = response.xpath(
        "//div[@class='greyframe']/table[2]/tr/td/table/tr")
    for row in rows:
        item = YangguangItem()
        item["title"] = row.xpath(
            "./td/a[@class='news14']/@title").extract_first()
        item["href"] = row.xpath(
            "./td/a[@class='news14']/@href").extract_first()
        item["publish_date"] = row.xpath(
            "./td[@class='t12wh']/text()").extract_first()
        # Detail page handled by parse_detail; item rides along in meta.
        yield scrapy.Request(item["href"],
                             meta={"item": item},
                             callback=self.parse_detail)
    # Pagination: stop when the ">" link disappears on the last page.
    next_page = response.xpath("//a[text()='>']/@href").extract_first()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Parse list rows and follow detail + pagination links.

    Bug fix: ``publish_date`` originally used an absolute ``//...`` xpath
    inside the row loop, so ``extract_first()`` returned the FIRST row's
    date for every item. The selector is now relative to the current row.
    """
    tr_list = response.xpath(
        '//div[@class= "greyframe"]/table[2]/tbody/tr/td/table/tr')
    for tr in tr_list:
        item = YangguangItem()
        item["title"] = tr.xpath(
            './td[2]/a[@class="news14"]/@title').extract_first()
        item["href"] = tr.xpath(
            './td[2]/a[@class="news14"]/@href').extract_first()
        # Row-relative date cell (class t12wh), matching the other fields.
        item["publish_date"] = tr.xpath(
            './td[@class="t12wh"]/text()').extract_first()
        yield scrapy.Request(
            item["href"],
            callback=self.parse_detail,
            meta={"item": item}
        )
    next_url = response.xpath("//a[text()='>']/@href").extract_first()
    if next_url is not None:
        yield scrapy.Request(
            next_url,
            callback=self.parse
        )
def parse(self, response):
    """Parse the title-state list entries and paginate.

    Bug fix: the original concatenated the host onto the next-page href
    BEFORE the None check, so the last page raised TypeError and the
    ``is not None`` guard was dead code. The raw href is now checked first.
    """
    li_list = response.xpath(
        '//div[@class="width-12"]/ul[@class="title-state-ul"]/li')
    for li in li_list:
        item = YangguangItem()
        item['title'] = li.xpath(
            './span[@class="state3"]/a/text()').extract_first()
        # Relative detail link -> absolute URL.
        item['href'] = 'http://wz.sun0769.com' + li.xpath(
            './span[@class="state3"]/a/@href').extract_first()
        yield scrapy.Request(url=item['href'],
                             callback=self.parse_detail,
                             meta={'item': item})
    next_href = response.xpath(
        '//div[@class="mr-three paging-box"]/a[2]/@href').extract_first()
    if next_href is not None:
        yield scrapy.Request(url='http://wz.sun0769.com' + next_href,
                             callback=self.parse)
def parse(self, response):
    """Parse complaint rows found under the top-aligned table cell and
    follow both detail links and the '>' pagination anchor."""
    rows = response.xpath("//td[@valign='top']//tr")
    for row in rows:
        item = YangguangItem()
        item["number"] = row.xpath(".//td[1]/text()").extract_first()
        item["href"] = row.xpath(".//td[2]/a[2]/@href").extract_first()
        item["title"] = row.xpath(".//td[2]/a[2]/@title").extract_first()
        item["time"] = row.xpath(".//td[last()]/text()").extract_first()
        # Detail page handled by self.details; the item rides in meta.
        yield scrapy.Request(item["href"],
                             meta={"item": item},
                             callback=self.details)
    next_page = rows.xpath("//a[text()='>']/@href").extract_first()
    print(next_page)
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Group the grey-frame table rows into items and follow the '>'
    pagination anchor."""
    # self.settings["MONGO_HOST"]
    # self.settings.get("MONGO_HOST","")
    rows = response.xpath(
        "//div[@class='greyframe']/table[2]/tr/td/table/tr")
    # Item field -> xpath relative to the row.
    columns = (
        ("title", "./td[2]/a[@class='news14']/@title"),
        ("href", "./td[2]/a[@class='news14']/@href"),
        ("publish_date", "./td[last()]/text()"),
    )
    for row in rows:
        item = YangguangItem()
        for field, path in columns:
            item[field] = row.xpath(path).extract_first()
        yield scrapy.Request(item["href"],
                             meta={"item": item},
                             callback=self.parse_detail)
    # Pagination.
    next_page = response.xpath("//a[text()='>']/@href").extract_first()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Parse the clear-list complaint entries and paginate.

    Bug fix: BASE_URL was prefixed onto the next-page href BEFORE the None
    check, so the last page raised TypeError and the guard never fired.
    The raw href is now validated first.
    """
    li_list = response.xpath('//li[@class="clear"]')
    for li in li_list:
        item = YangguangItem()
        item['id'] = li.xpath('./span[@class="state1"]/text()').get()
        item['status'] = li.xpath(
            './span[@class="state2"]/text()').get().strip()
        item['title'] = li.xpath('./span[@class="state3"]/a/text()').get()
        item['href'] = BASE_URL + li.xpath(
            './span[@class="state3"]/a/@href').get()
        # state4 reads "等待处理:<duration>"; keep only the duration.
        item['response_time'] = re.findall(
            '等待处理:(.*)',
            li.xpath('./span[@class="state4"]/text()').get().strip())[0]
        # NOTE(review): absolute path below selects the FIRST entry's
        # span[5] for every row — looks like it should be relative to
        # `li`; confirm against the page before changing.
        item['publish_time'] = response.xpath(
            '/html/body/div[2]/div[3]/ul[2]/li[1]/span[5]/text()').get()
        yield scrapy.Request(item['href'],
                             callback=self.parse_detail,
                             meta={'item': item})
    # 翻页 — pagination with the None check before URL construction.
    next_href = response.xpath(
        '//div[@class="mr-three paging-box"]/a[2]/@href').get()
    if next_href is not None:
        yield scrapy.Request(BASE_URL + next_href, callback=self.parse)
def parse(self, response): print('spider start') # 分组 try: li_list = response.css('li.clear') for li in li_list: item = YangguangItem() item['id'] = li.css('span.state1::text').get() item['state'] = li.css('span.state2::text').get() item['title'] = li.xpath( "span[@class='state3']/a[@class='color-hover']/text()" ).get() item['href'] = li.xpath( "span[@class='state3']/a[@class='color-hover']/@href").get( ) item['href'] = 'http://wz.sun0769.com' + item['href'] # item['href'] = ['http://wz.sun0769.com'+ i for i in item['href']] item['sleepTime'] = li.css('span.state4::text').get() item['time'] = li.css('span.state5::text').get() # print(item) yield scrapy.Request(item['href'], callback=self.parseDetail, meta={"item": item}) # 处理详情页面 # 翻页 next_url = response.xpath( "//div[@class='mr-three paging-box']/a/@href").get() print(next_url) next_url = 'http://wz.sun0769.com' + next_url if next_url is not None: yield scrapy.Request(next_url, callback=self.parse) print(next_url) # next_url = response.xpath("//a[text()='>']/@href") except TypeError: print('爬虫完成')
def parse(self, response):
    """Parse a start_urls response: emit one detail request per table row,
    then hand the next list page back to this same method."""
    # host = self.settings["MONGO_HOST"]        # option 1
    # host = self.settings.get("MONGO_HOST")    # option 2, preferred
    rows = response.xpath(
        "//div[@class='greyframe']/table[2]/tr/td/table/tr")
    for row in rows:
        item = YangguangItem()
        item["title"] = row.xpath(
            "./td[2]/a[@class='news14']/@title").extract_first()
        item["href"] = row.xpath(
            "./td[2]/a[@class='news14']/@href").extract_first()
        item["publish_date"] = row.xpath("./td[5]/text()").extract_first()
        print(item)
        # Yield to the detail-page callback; meta carries the item along.
        yield scrapy.Request(item["href"],
                             meta={"item": item},
                             callback=self.parse_detail)
    # Locate and follow the next page, if any.
    next_page = response.xpath("//a[text()='>']/@href").extract_first()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Parse newsHead table rows; next-page requests bypass the dupe filter."""
    rows = response.xpath(
        "//div[@class='newsHead clearfix']/table[2]/tr")
    for row in rows:
        item = YangguangItem()
        item['title'] = row.xpath("./td[3]/a[1]/@title").extract_first()
        item['href'] = row.xpath("./td[3]/a[1]/@href").extract_first()
        item['publish_date'] = row.xpath(
            "./td[last()]/text()").extract_first()
        yield scrapy.Request(item['href'],
                             meta={'item': item},
                             callback=self.parse_dateil)
    # Pagination; dont_filter lets the scheduler re-issue URLs the
    # duplicate filter would otherwise drop.
    next_page = response.xpath("//a[text()='>']/@href").extract_first()
    if next_page is not None:
        yield scrapy.Request(
            next_page,
            callback=self.parse,
            dont_filter=True,
        )
def parse(self, response):
    """Parse list rows; the detail URL is selected separately from the
    item fields and the '>' anchor drives pagination."""
    rows = response.xpath(
        "//div[@class='greyframe']/table[2]/tr//table/tr")
    for row in rows:
        item = YangguangItem()
        item["title"] = row.xpath("./td[2]/a[2]/text()").extract_first()
        item["status"] = row.xpath("./td[3]/span/text()").extract_first()
        item["name"] = row.xpath("./td[4]/text()").extract_first()
        item["publish_date"] = row.xpath("./td[5]/text()").extract_first()
        # The news14 anchor carries each question's detail-page URL.
        detail_url = row.xpath(
            "./td[2]/a[@class='news14']/@href").extract_first()
        # Request the detail page with the item in meta.
        yield scrapy.Request(detail_url,
                             meta={"item": item},
                             callback=self.parse_detail)
    next_page = response.xpath("//a[text()='>']/@href").extract_first()
    if next_page is not None:
        yield scrapy.Request(next_page, callback=self.parse)