def parse_data(self, response):
    """Parse one page of daily AQI records from an HTML table.

    Yields one AqiItem per data row; the header row is skipped.
    """
    # All table rows; the first one is the header.
    tr_list = response.xpath('//tr')
    tr_list.pop(0)
    for tr in tr_list:
        # FIX: create a fresh item per row. The original reused one item
        # across yields, so rows processed later could overwrite items
        # already handed to the engine/pipeline.
        item = AqiItem()
        # Date
        item['date'] = tr.xpath('./td[1]/text()').extract()
        # AQI
        item['aqi'] = tr.xpath('./td[2]/text()').extract()
        # Quality level
        item['level'] = tr.xpath('./td[3]/span/text()').extract()
        # PM2.5
        item['PM2_5'] = tr.xpath('./td[4]/text()').extract()
        # PM10 — FIX: was `.extractt()` (typo), which raised
        # AttributeError on every row.
        item['PM10'] = tr.xpath('./td[5]/text()').extract()
        # SO2
        item['SO2'] = tr.xpath('./td[6]/text()').extract()
        # CO
        item['CO'] = tr.xpath('./td[7]/text()').extract()
        # NO2
        item['NO2'] = tr.xpath('./td[8]/text()').extract()
        # O3 (8h)
        item['O3'] = tr.xpath('./td[9]/text()').extract()
        yield item
def parse_day(self, response):
    """Parse the daily AQI table for one city/month page.

    The city name is sliced out of the page title; one AqiItem is
    yielded per data row (header skipped).
    """
    title = response.xpath('//*[@id="title"]/text()').extract_first()
    # Title format puts the city name between fixed-width prefix/suffix
    # text; slice indices match that layout.
    city_name = title[8:-11]
    # 1. All rows; 2. drop the header row.
    tr_list = response.xpath('//tr')
    tr_list.pop(0)
    for tr in tr_list:
        # FIX: build a new item each iteration. The original mutated a
        # single shared AqiItem, so every yielded reference pointed at
        # the same object and earlier rows were silently overwritten.
        item = AqiItem()
        item['city_name'] = city_name
        # Date
        item['date'] = tr.xpath('./td[1]/text()').extract_first()
        # AQI
        item['aqi'] = tr.xpath('./td[2]/text()').extract_first()
        # Quality level
        item['level'] = tr.xpath('./td[3]//text()').extract_first()
        # PM2.5
        item['pm2_5'] = tr.xpath('./td[4]/text()').extract_first()
        # PM10
        item['pm10'] = tr.xpath('./td[5]/text()').extract_first()
        # Sulfur dioxide
        item['so_2'] = tr.xpath('./td[6]/text()').extract_first()
        # Carbon monoxide
        item['co'] = tr.xpath('./td[7]/text()').extract_first()
        # Nitrogen dioxide
        item['no_2'] = tr.xpath('./td[8]/text()').extract_first()
        # Ozone
        item['o_3'] = tr.xpath('./td[9]/text()').extract_first()
        # Hand the item to the engine --> pipeline.
        yield item
def parse_day(self, response):
    """Parse daily AQI data rows, tagging each item with city/url/time."""
    # All table rows on the page.
    node_list = response.xpath('//tr')
    # City name: the panel heading text up to the first '2' (start of
    # the year, e.g. "Beijing2018..."); fragile but matches the site's
    # heading format.
    city = response.xpath('//div[@class="panel-heading"]/h3/text()'
                          ).extract_first().split('2')[0]
    # FIX: drop the header row. It has no ./td text nodes, so the
    # original yielded one item whose data fields were all None.
    node_list.pop(0)
    for node in node_list:
        # Fresh item container per row.
        item = AqiItem()
        # Fixed, row-independent fields first.
        item['city'] = city
        item['url'] = response.url
        item['timestamp'] = time.time()
        # Per-row data columns.
        item['date'] = node.xpath('./td[1]/text()').extract_first()
        item['AQI'] = node.xpath('./td[2]/text()').extract_first()
        item['LEVEL'] = node.xpath('./td[3]/span/text()').extract_first()
        item['PM2_5'] = node.xpath('./td[4]/text()').extract_first()
        item['PM10'] = node.xpath('./td[5]/text()').extract_first()
        item['SO2'] = node.xpath('./td[6]/text()').extract_first()
        item['CO'] = node.xpath('./td[7]/text()').extract_first()
        item['NO2'] = node.xpath('./td[8]/text()').extract_first()
        item['O3'] = node.xpath('./td[9]/text()').extract_first()
        # Return the item to the engine.
        yield item
def parse_day(self, response):
    """Parse the target data — one item per day's row.

    :param response: the month page response
    :return: yields AqiItem instances
    """
    title = response.xpath('//*[@id="title"]/text()').extract_first()
    # City name sits between fixed-length prefix/suffix text in the title.
    city_name = title[8:-11]
    tr_list = response.xpath('//tr')
    tr_list.pop(0)  # drop the header row
    for tr in tr_list:
        # FIX: allocate a new item per row. The original reused one
        # AqiItem for every yield, so all yielded references were the
        # same mutated object.
        item = AqiItem()
        item['city_name'] = city_name
        # Date
        item['date'] = tr.xpath('./td[1]/text()').extract_first()
        # AQI
        item['aqi'] = tr.xpath('./td[2]/text()').extract_first()
        # Quality level
        item['level'] = tr.xpath('./td[3]//text()').extract_first()
        # PM2.5
        item['pm2_5'] = tr.xpath('./td[4]/text()').extract_first()
        # PM10
        item['pm_10'] = tr.xpath('./td[5]/text()').extract_first()
        # Sulfur dioxide
        item['so_2'] = tr.xpath('./td[6]/text()').extract_first()
        # Carbon monoxide
        item['co'] = tr.xpath('./td[7]/text()').extract_first()
        # Nitrogen dioxide
        item['no_2'] = tr.xpath('./td[8]/text()').extract_first()
        # Ozone
        item['o_3'] = tr.xpath('./td[9]/text()').extract_first()
        yield item
def parse_day(self, response):
    """Extract each day's record for one city/month page from the response.

    The city name is URL-encoded in the request's query string
    (``...?city=<name>&...``). NOTE: uses Python 2 APIs
    (``urllib.unquote``, ``str.decode``).
    """
    page_url = response.url
    encoded_city = page_url[page_url.find("=") + 1:page_url.rfind("&")]
    # Decode the percent-encoded city name back to a UTF-8 byte string.
    city = urllib.unquote(encoded_city)
    rows = response.xpath("//div[@class='row']//tr")
    rows.pop(0)  # first row is the table header
    # Column xpaths keyed by the item field they populate.
    columns = [
        ('date', "./td[1]/text()"),
        ('aqi', "./td[2]/text()"),
        ('level', "./td[3]/span/text()"),
        ('pm2_5', "./td[4]/text()"),
        ('pm10', "./td[5]/text()"),
        ('so2', "./td[6]/text()"),
        ('co', "./td[7]/text()"),
        ('no2', "./td[8]/text()"),
        ('o3', "./td[9]/text()"),
    ]
    for row in rows:
        item = AqiItem()
        item['city'] = city.decode("utf-8")
        for field, xp in columns:
            item[field] = row.xpath(xp).extract_first()
        yield item
def parse(self, response):
    """Collect city names/links and request each city's month page.

    NOTE(review): the ``[2:3]`` slice limits the crawl to a single city,
    presumably for testing — confirm before a full run.
    """
    anchor = '/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a'
    names = response.xpath(anchor + '/text()').extract()[2:3]
    links = response.xpath(anchor + '/@href').extract()[2:3]
    for name, link in zip(names, links):
        item = AqiItem()
        item['city_name'] = name
        # Build the absolute month-page URL and pass the item along.
        yield scrapy.Request(self.base_url + link,
                             callback=self.parse_month,
                             meta={'aqi': item})
def parse(self, response):
    """Follow the first few city month-data links.

    One AqiItem (carrying the city name) is attached to each request
    via ``meta``.
    """
    # hrefs of the city month-data links (limited to 3 for testing).
    monthdata_href_list = response.xpath(
        '//div[@class="all"]//ul//a/@href')[:3]
    for monthdata_href in monthdata_href_list:
        item = AqiItem()
        url = 'https://www.aqistudy.cn/historydata/' + monthdata_href.extract()
        # FIX: was `url[55:]` — a magic character offset that silently
        # yields garbage if the base URL or path ever changes. Take the
        # value after 'city=' instead (same result for the current
        # 'monthdata.php?city=<name>' link format).
        item['city'] = url.split('city=')[-1]
        yield scrapy.Request(url,
                             callback=self.parse_monthdata,
                             meta={'item': item})
def parse(self, response):
    """Request each city's month page, tagging the request with its name.

    FIX: the original created ONE AqiItem before the loop and passed the
    same object in every request's ``meta`` — because responses arrive
    after the loop finishes, every callback saw the LAST city's name.
    A fresh item is now created per city.
    """
    city_name_list = response.xpath(
        '/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/text()'
    ).extract()
    city_link_list = response.xpath(
        '/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a/@href'
    ).extract()
    for city_name, city_link in zip(city_name_list, city_link_list):
        item = AqiItem()
        item["city_name"] = city_name
        month_url = self.base_url + city_link
        yield scrapy.Request(url=month_url,
                             meta={"aqi_item": item},
                             callback=self.month_parse)
def parse(self, response):
    """Yield one item per city with its name and absolute URL.

    NOTE(review): the follow-up Request to ``parse_month`` is commented
    out — items are yielded directly instead; confirm this is intended.
    """
    names = response.xpath(
        '//div[@class="bottom"]/ul/div[2]/li/a/text()').extract()
    links = response.xpath(
        '//div[@class="bottom"]/ul/div[2]/li/a/@href').extract()
    for name, link in zip(names, links):
        item = AqiItem()
        item['city_name'] = name
        item['city_url'] = self.base_url + link
        # yield scrapy.FormRequest(item['city_url'], callback=self.parse_month, meta={'aqi': item})
        yield item
def parse(self, response):
    """Request the month page for a single city.

    NOTE(review): the ``[36:37]`` slice pins the crawl to one specific
    city, presumably for testing — confirm before a full run.
    """
    anchor = '/html/body/div[3]/div/div[1]/div[2]/div[2]/ul/div[2]/li/a'
    names = response.xpath(anchor + '/text()').extract()[36:37]
    links = response.xpath(anchor + '/@href').extract()[36:37]
    for name, link in zip(names, links):
        item = AqiItem()
        item['city_name'] = name
        # Absolute URL for the city's month listing.
        target = self.base_url + link
        yield scrapy.Request(target,
                             meta={'citykey': item},
                             callback=self.parse_month)
def parse_day(self, response):
    """Parse daily rows for the city carried in ``response.meta['city']``."""
    city_name = response.meta['city']
    rows = response.xpath('//tbody/tr')
    rows.pop(0)  # first row is the table header
    # Field name -> column xpath, in table order.
    columns = [
        ('date', './td[1]/text()'),
        ('aqi', './td[2]/text()'),
        ('level', './td[3]/span/text()'),
        ('pm2_5', './td[4]/text()'),
        ('pm10', './td[5]/text()'),
        ('so2', './td[6]/text()'),
        ('co', './td[7]/text()'),
        ('no2', './td[8]/text()'),
        ('o3', './td[9]/text()'),
    ]
    for row in rows:
        item = AqiItem()
        item['city'] = city_name
        for field, xp in columns:
            item[field] = row.xpath(xp).extract_first()
        yield item
def parse_day(self, response):
    """Parse daily AQI rows; city name is sliced from the page title."""
    node_list = response.xpath("//tr")
    node_list.pop(0)  # drop the header row
    city = response.xpath("//h2[@id='title']/text()").extract()[0]
    for node in node_list:
        item = AqiItem()
        # Title has fixed-width text around the city name.
        item['city'] = city[8:-11]
        # FIX: the original assigned raw Selector objects (xpath()
        # without extract), so exported items held selector reprs
        # instead of the cell text. Extract the first text node,
        # matching this file's other parsers.
        item['date'] = node.xpath("./td[1]/text()").extract_first()
        item['aqi'] = node.xpath("./td[2]/text()").extract_first()
        item['level'] = node.xpath("./td[3]/span/text()").extract_first()
        item['pm2_5'] = node.xpath("./td[4]/text()").extract_first()
        item['pm10'] = node.xpath("./td[5]/text()").extract_first()
        item['so2'] = node.xpath("./td[6]/text()").extract_first()
        item['co'] = node.xpath("./td[7]/text()").extract_first()
        item['no2'] = node.xpath("./td[8]/text()").extract_first()
        item['o3'] = node.xpath("./td[9]/text()").extract_first()
        yield item
def parse_day(self, response):
    """Parse daily rows; the city name is the title text before '空气'."""
    # Slice the city name off the front of the page title.
    title = response.xpath("//h2[@id='title']/text()").extract_first()
    city_name = title[:title.find(u'空气')]
    rows = response.xpath('//tbody/tr')
    rows.pop(0)  # drop the header row
    columns = [
        ('date', './td[1]/text()'),
        ('aqi', './td[2]/text()'),
        ('level', './td[3]/span/text()'),
        ('pm2_5', './td[4]/text()'),
        ('pm10', './td[5]/text()'),
        ('so2', './td[6]/text()'),
        ('co', './td[7]/text()'),
        ('no2', './td[8]/text()'),
        ('o3', './td[9]/text()'),
    ]
    for row in rows:
        item = AqiItem()
        item['city'] = city_name
        for field, xp in columns:
            item[field] = row.xpath(xp).extract_first()
        yield item
def parse_day(self, response):
    """Parse daily rows for the city in ``response.meta['city_name']``."""
    node_list = response.xpath("//tr")
    node_list.pop(0)  # drop the header row
    for node in node_list:
        item = AqiItem()
        item['city'] = response.meta["city_name"]
        # FIX: was `.extract()[0]` for every field, which raises
        # IndexError the moment a cell has no text node and kills the
        # whole page's parse. `extract_first()` returns None for an
        # empty cell instead (and is what the sibling parsers use).
        item['date'] = node.xpath("./td[1]/text()").extract_first()
        item['aqi'] = node.xpath("./td[2]/text()").extract_first()
        item['level'] = node.xpath("./td[3]/span/text()").extract_first()
        item['pm2_5'] = node.xpath("./td[4]/text()").extract_first()
        item['pm10'] = node.xpath("./td[5]/text()").extract_first()
        item['so2'] = node.xpath("./td[6]/text()").extract_first()
        item['co'] = node.xpath("./td[7]/text()").extract_first()
        item['no2'] = node.xpath("./td[8]/text()").extract_first()
        item['o3'] = node.xpath("./td[9]/text()").extract_first()
        yield item
def parse_day(self, response):
    """Parse daily rows; bail out early when the page has no table body."""
    rows = response.xpath("//div[@class='row']//tbody/tr")
    # Nothing to parse on an empty page.
    if not rows:
        return
    rows.pop(0)  # drop the header row
    columns = [
        ("date", "./td[1]//text()"),
        ("aqi", "./td[2]//text()"),
        ("level", "./td[3]//text()"),
        ("pm2_5", "./td[4]//text()"),
        ("pm10", "./td[5]//text()"),
        ("so2", "./td[6]//text()"),
        ("co", "./td[7]//text()"),
        ("no2", "./td[8]//text()"),
        ("o3", "./td[9]//text()"),
    ]
    for row in rows:
        item = AqiItem()
        item['city'] = response.meta['city']
        for field, xp in columns:
            item[field] = row.xpath(xp).extract_first()
        yield item
def parse_day(self, response):
    """Extract each day's record for one city/month from the response.

    NOTE(review): the header row is NOT skipped here (the pop was
    deliberately disabled upstream), so the first yielded item may have
    None in every data field — confirm this is intended.
    """
    rows = response.xpath("//div[@class='row']//tr")
    columns = [
        ('date', "./td[1]/text()"),
        ('aqi', "./td[2]/text()"),
        ('level', "./td[3]/span/text()"),
        ('pm2_5', "./td[4]/text()"),
        ('pm10', "./td[5]/text()"),
        ('so2', "./td[6]/text()"),
        ('co', "./td[7]/text()"),
        ('no2', "./td[8]/text()"),
        ('o3', "./td[9]/text()"),
    ]
    for row in rows:
        item = AqiItem()
        # City name travels with the request in meta.
        item['city'] = response.meta["name"]
        for field, xp in columns:
            item[field] = row.xpath(xp).extract_first()
        yield item
def parse_day(self, response):
    """Parse daily rows for the city in ``response.meta['city']``.

    FIX: removed the leftover debug ``print()`` calls (response body
    length and per-item city name) that spammed stdout on every page.
    """
    city_name = response.meta['city']
    node_list = response.xpath("//tbody/tr")
    node_list.pop(0)  # drop the header row
    for node in node_list:
        item = AqiItem()
        item['city'] = city_name
        item['date'] = node.xpath("./td[1]/text()").extract_first()
        item['aqi'] = node.xpath("./td[2]/text()").extract_first()
        item['level'] = node.xpath("./td[3]/span/text()").extract_first()
        item['pm2_5'] = node.xpath("./td[4]/text()").extract_first()
        item['pm10'] = node.xpath("./td[5]/text()").extract_first()
        item['so2'] = node.xpath("./td[6]/text()").extract_first()
        item['co'] = node.xpath("./td[7]/text()").extract_first()
        item['no2'] = node.xpath("./td[8]/text()").extract_first()
        item['o3'] = node.xpath("./td[9]/text()").extract_first()
        yield item