def shuichanpin_parse(self, response):
    """Parse one JSON page of aquatic-product price rows from cncyms.cn.

    Yields one FarmItem per product row whose release date falls inside the
    crawl window (self.today - self.crawl_day), then schedules the next page
    until self.shuichanpin_current_num reaches self.max_crawl_num.
    """
    now = time.strftime('%Y-%m-%d', time.localtime())
    json_response = json.loads(response.body)
    # Iterate the rows the server actually returned instead of a hard-coded
    # range(0, 18): a short final page would otherwise raise IndexError, and
    # a longer page would silently drop rows.
    for row in json_response["list"]:
        farm_item = FarmItem()
        farm_item['province'] = "山东"
        farm_item['market'] = "青岛市城阳蔬菜水产品批发市场"
        # 'typy' matches the FarmItem field spelling used project-wide.
        farm_item['typy'] = row["PSort"]
        farm_item['name'] = row["PName"]
        farm_item['standard'] = "none"
        farm_item['area'] = "华东"
        farm_item['color'] = "none"
        farm_item['unit'] = "元/公斤"
        farm_item['minPrice'] = row["LPrice"]
        farm_item['avgPrice'] = row["PPrice"]
        farm_item['maxPrice'] = row["MPrice"]
        farm_item['entertime'] = now
        farm_item['time'] = row["ReleaseTime"]
        # Only emit records released within the configured crawl window.
        # NOTE(review): assumes ReleaseTime is formatted "%Y-%m-%d" and
        # self.today is an epoch timestamp — confirm in spider __init__.
        released = time.mktime(time.strptime(farm_item['time'], "%Y-%m-%d"))
        if released > self.today - self.crawl_day:
            yield farm_item
    self.shuichanpin_current_num += 1
    if self.shuichanpin_current_num != self.max_crawl_num:
        yield scrapy.FormRequest(
            url='http://www.cncyms.cn/pages.php',
            formdata={
                "pageNum": str(self.shuichanpin_current_num),
                "pname": "",
                # Server-side category filter: aquatic products.
                "reltime": "水产品",
            },
            callback=self.shuichanpin_parse,
        )
def parse(self, response):
    """Parse one JSON page of non-staple food ("副食品") price rows from cncyms.cn.

    Yields a FarmItem per product row, then schedules the next page until
    page 2942 (the site's known last page for this category).
    """
    now = time.strftime('%Y-%m-%d', time.localtime())
    json_response = json.loads(response.body)
    # Iterate the rows the server actually returned instead of a hard-coded
    # range(0, 18): a short final page would otherwise raise IndexError, and
    # a longer page would silently drop rows.
    for row in json_response["list"]:
        farm_item = FarmItem()
        farm_item['province'] = "山东"
        farm_item['market'] = "青岛市城阳蔬菜水产品批发市场"
        # 'typy' matches the FarmItem field spelling used project-wide.
        farm_item['typy'] = row["PSort"]
        farm_item['name'] = row["PName"]
        farm_item['standard'] = "none"
        farm_item['area'] = "华东"
        farm_item['color'] = "none"
        farm_item['unit'] = "元/公斤"
        farm_item['minPrice'] = row["LPrice"]
        farm_item['avgPrice'] = row["PPrice"]
        farm_item['maxPrice'] = row["MPrice"]
        farm_item['entertime'] = now
        farm_item['time'] = row["ReleaseTime"]
        yield farm_item
    self.current_num += 1
    print("=====================crawl:" + str(self.current_num))
    # 2942 is the last page number for this category as observed on the
    # site; presumably updated by hand when the site grows — TODO confirm.
    if self.current_num != 2942:
        yield scrapy.FormRequest(
            url='http://www.cncyms.cn/pages.php',
            formdata={
                "pageNum": str(self.current_num),
                "pname": "",
                "reltime": "副食品",
            },
            callback=self.parse,
        )
def pricePage_parse(self, response):
    """Parse a vipveg.com price-table page for Mudanjiang vegetable prices.

    Yields one FarmItem per table row, then re-requests every pagination
    link on the page (scrapy's dupefilter prevents re-crawling pages that
    were already visited).
    """
    print("------价格页面解析函数------")
    item_list = response.xpath("//table[@class='f_s_14']/tr")
    for i_item in item_list:
        farm_item = FarmItem()  # one item per table row
        farm_item['province'] = "黑龙江"
        farm_item['market'] = "牡丹江地利农副产品有限公司"
        farm_item['typy'] = "蔬菜"
        # [11:-4] strips a fixed-length prefix/suffix from the link text;
        # NOTE(review): extract_first() may return None on malformed rows,
        # which would raise TypeError here — confirm table is always regular.
        farm_item['name'] = i_item.xpath(
            "./td[5]/a/text()").extract_first()[11:-4]
        farm_item['standard'] = "none"
        farm_item['area'] = "东北"
        farm_item['color'] = "none"
        farm_item['unit'] = "元/斤"
        # [1:] drops the leading currency symbol from each price cell.
        farm_item['minPrice'] = i_item.xpath(
            "./td[2]/text()").extract_first()[1:]
        farm_item['avgPrice'] = i_item.xpath(
            "./td[4]/text()").extract_first()[1:]
        farm_item['maxPrice'] = i_item.xpath(
            "./td[3]/text()").extract_first()[1:]
        farm_item['entertime'] = self.now
        farm_item['time'] = i_item.xpath("./td[1]/text()").extract_first()
        yield farm_item
    # (removed unused `current_page` lookup — it was assigned but never read)
    print("正在补充当前品种的所有url")
    page_list = response.xpath("//div[@id='pager']/a")
    for page in page_list:
        yield scrapy.Request(
            "http://www.vipveg.com" + page.xpath("./@href").extract_first(),
            callback=self.pricePage_parse,
        )
def parse(self, response):
    """Parse the vipveg.com index page.

    Each link in the target table is a city/province entry. A partially
    populated FarmItem is attached to the request meta and completed later
    by provinceIndexParse. Cities whose region lookup fails ("异常") are
    skipped.
    """
    print("----------正在解析首页-----------")
    city_links = response.xpath("//td[@class='borderTop p_5']/table/tr/td/a")
    for link in city_links:
        city_name = link.xpath("./text()").extract_first()
        region = self.getArea(city_name)
        # Pre-fill the fields known at index time; the rest are filled in
        # by the downstream callback.
        seed_item = FarmItem()
        for field, value in (
            ('province', city_name),
            ('area', region),
            ('typy', "蔬菜"),
            ('standard', "none"),
            ('color', "none"),
            ('unit', "元/斤"),
            ('entertime', self.now),
        ):
            seed_item[field] = value
        if region == "异常":
            continue  # unresolvable city -> do not follow its link
        href = link.xpath("./@href").extract_first()
        yield scrapy.Request(
            "http://www.vipveg.com" + href,
            meta={'item': seed_item},
            callback=self.provinceIndexParse,
        )