def parse_url(response):
    """Parse a car detail page into a CarItem.

    Extracts title, model, price, description, manufacturer, image and
    breadcrumb category from the page and yields one populated CarItem.
    """
    title = response.xpath(
        '//h1[@class="title-page"]/text()').extract_first()
    model = response.xpath(
        '//meta[@itemprop="model"]/@content').extract_first()
    raw_price = response.xpath(
        '//span[@class="or_price price"]/text()').extract_first()
    # FIX: extract_first() returns None when the price span is missing;
    # the original filter(...) call then raised TypeError. Fall back to "".
    price = "".join(ch for ch in raw_price if ch.isdigit()) if raw_price else ""
    desc = response.xpath(
        '//div[@class="span7 offset1"]/*[not(self::div[@class="buy"]) and not(self::div[@class="row"]) and not(self::div[@class="clearfix"])]'
    ).extract()
    description = "".join(desc)
    manufacturer = response.xpath(
        '//meta[@itemprop="manufacturer"]/@content').extract_first()
    image = response.xpath(
        '//meta[@itemprop="image"]/@content').extract_first()
    # Second-to-last breadcrumb entry is the category.
    category = response.xpath(
        '//span[@typeof="v:Breadcrumb" and position() = (last()-1)]/a/@alt'
    ).extract_first()
    item = CarItem()
    item['model'] = model
    item['title'] = title
    item['price'] = price
    item['description'] = description
    item['manufacturer'] = manufacturer
    item['image'] = image
    item['category'] = category
    yield item
def parse(self, response):
    """Parse a search-result page, yielding one CarItem per result card."""
    results = response.selector.xpath('//div[@class="search-result-list-item"]')
    for result in results:
        item = CarItem()
        item['car_name'] = result.xpath(
            "./a/p[@class='cx-name text-hover']/text()").extract()
        # FIX: the price node was queried and extracted twice (once for
        # max_price, once for min_price); extract it a single time.
        price_text = result.xpath(
            "./a/p[@class='cx-price']/text()").extract()[0]
        parts = price_text.split('-')
        item['max_price'] = parts[0]
        # Drop the trailing unit suffix (last two characters) from the
        # upper bound, matching the original [:-2] slice.
        item['min_price'] = parts[1][:-2]
        item['pic_src'] = result.xpath("./a/img/@src").extract()
        yield item
def parse(self, response):
    """Parse category boxes, yielding a CarItem with absolute image URLs per box."""
    # Skip the first uibox (index 0) — it is not a category gallery.
    datas = response.xpath(
        "//div[@class='column grid-16']/div[@class='uibox']")[1:]
    for data in datas:
        cate = data.xpath(
            ".//div[@class='uibox-title']/a[1]/text()").get().strip()
        # Image srcs are relative; make them absolute against the response URL.
        urls = [response.urljoin(url)
                for url in data.xpath(".//ul/li/a/img/@src").getall()]
        # FIX: replaced bare print() debug banners with the spider logger.
        self.logger.info("category: %s", cate)
        yield CarItem(cate=cate, urls=urls)
def parse_item(self, response):
    """Build a CarItem for a single car model page.

    Always fills ``name``; the price fields and sale status depend on
    whether the page is marked on-sale ("zaishou") or not-yet-released
    ("weishangshi"). Pages matching neither marker get only ``name``.
    """
    result = CarItem()
    breadcrumb_text = response.xpath(
        '//span[@class="yiche-breadcrumb_item-txt"]/text()').get()
    result['name'] = breadcrumb_text.strip()

    on_sale_marker = response.xpath('//div[contains(@class,"zaishou")]')
    if on_sale_marker:
        ref_text = response.xpath('//span[@class="ref-price"]/text()').get()
        guide_text = response.xpath('//span[@class="guide-price"]/text()').get()
        result['ref_price'] = ref_text.strip()
        result['guide_price'] = guide_text.strip()
        result['sell_now'] = "在售"
    elif response.xpath('//div[contains(@class,"weishangshi")]'):
        result['ref_price'] = "无"
        result['guide_price'] = "无"
        result['sell_now'] = "未上市"
    return result
def parse(self, response):
    """Parse one ranking page (20 rows) and follow the next page.

    Yields one CarItem per ranking row; displacement and gearbox may be
    missing on the page and fall back to '暂无信息'.
    """
    cars = response.xpath('//div[@class="tbA"]/ul/li')  # 20 rows per page
    for each in cars:
        # FIX: CarItem was created ONCE outside the loop and mutated/yielded
        # for every row, so all yielded items shared a single object;
        # create a fresh item per row.
        item = CarItem()
        item['ranking'] = each.xpath('./span/text()').extract()[0]
        item['car_name'] = each.xpath(
            './div[@class="info"]/p[@class="sname"]/a/text()').extract()[0]
        item['price'] = each.xpath(
            './div[@class="info"]/p[@class="col col1 price"]/em/text()'
        ).extract()[0]
        item['hot'] = each.xpath(
            './div[@class="info"]/p[@class="col rank"]/span[@class="fl red rd-mark"]/text()'
        ).extract()[0]
        item['brand'] = each.xpath(
            './div[@class="info"]/p[@class="col col1"][1]/text()').extract()[0]
        item['style'] = each.xpath(
            './div[@class="info"]/p[@class="col"][1]/text()').extract()[0]
        # Displacement / gearbox nodes may be absent: query each once
        # (the original ran the same xpath twice) and fall back to a
        # placeholder instead of indexing an empty selector list.
        dispt_nodes = each.xpath('./div[@class="info"]/p[@class="col col1"]/em')
        gear_nodes = each.xpath('./div[@class="info"]/p[@class="col"]/em')
        item['dispt'] = (dispt_nodes[0].xpath('string(.)').extract()[0]
                         if dispt_nodes else '暂无信息')
        item['gear'] = (gear_nodes[0].xpath('string(.)').extract()[0]
                        if gear_nodes else '暂无信息')
        yield item
    # Pagination: request the following pages up to page 30.
    if self.offset < 30:
        self.offset += 1
        self.url = 'https://price.pcauto.com.cn/top/k75-p{0}.html'.format(
            str(self.offset))
        yield scrapy.Request(self.url, callback=self.parse)
def parseItem(self, response):
    """Parse a used-car listing page into a CarItem.

    Reads spec rows (km driven, gearbox, engine), description, subject,
    publish time, contact info and images, then yields the item.
    """
    # FIX: km/car_type/engine_type were only bound inside matching branches,
    # so a page missing a spec row raised NameError when building CarItem.
    km = car_type = engine_type = ''
    divs = response.xpath('//*[@id="mail_parent"]').getall()
    for div in divs:
        div = scrapy.Selector(text=div)
        # FIX: guard against a missing <label> ("x in None" raises TypeError).
        label = div.xpath('//label/text()').get() or ''
        if "Số Km đã đi:" in label:
            km = div.xpath('//span/text()').get()
        elif "Hộp số:" in label:
            car_type = div.xpath('//span/text()').get()
        elif "Động cơ:" in label:
            engine_type = div.xpath('//span/text()').get()
    desc = ' '.join(response.xpath(
        '//div[contains(@class,"car_des")]/div/text()').getall())
    subject = response.xpath(
        '//div[contains(@class,"title")]/h1/text()').get()
    # Convert the dd/mm/yyyy date (third whitespace-separated token of the
    # title div) to a unix timestamp.
    publish_time = time.mktime(
        datetime.datetime.strptime(
            response.xpath('//div[contains(@class,"title")]/div/text()')
            .get().split()[2], "%d/%m/%Y").timetuple())
    contact_parts = response.xpath(
        '//div[contains(@class,"contact-txt")]//text()').getall()
    # NOTE(review): assumes the phone number is always the 4th text node and
    # the address the last one — confirm against the page layout.
    phone_number = ''.join(re.findall(r'\d+', contact_parts[3]))
    address = contact_parts[-1]
    images = json.dumps(response.xpath(
        '//div[contains(@class,"highslide-gallery")]//img/@src').getall())
    car_brand = response.xpath(
        '//*[@id="wrapper"]/div[2]/span[4]/a/span/strong//text()').get()
    yield CarItem(car_brand=car_brand, km=km, car_type=car_type,
                  engine_type=engine_type, desc=desc, subject=subject,
                  publish_time=publish_time, phone_number=phone_number,
                  address=address, org_link=response.url, images=images)
def parse3(self, response):
    """Layer-3 page parser.

    Extracts website name/price/score/feeling into a CarItem, then issues
    one parse4 request per detail URL, passing the item through meta.
    """
    self.logging.info("url is: %s." % response.url)
    try:
        item = CarItem()
        urls = response.xpath(
            "//div[@class='title-name name-width-01']/a/@href").extract()
        website_name = response.xpath(
            "//div[@class='subnav-title-name']/a").extract()
        if website_name:
            item['website_name'] = common_tools.data_cleaning(website_name[0])
        website_price = response.xpath(
            "//div[@class='price']/span[@class='font-16']").extract()
        if website_price:
            # Strip HTML tags and control whitespace from the fragment.
            item['website_price'] = re.sub(r'<.*?>|\r|\n|\t', '',
                                           website_price[0])
        website_score = response.xpath(
            "//span[@class='font-arial number-fen']").extract()
        if website_score:
            item['website_score'] = re.sub(r'<.*?>|\r|\n|\t', '',
                                           website_score[0])
        website_feeling = response.xpath(
            "//div[@class='revision-impress impress-small']").extract()
        if website_feeling:
            item['website_feeling'] = re.sub(r'<.*?>|\r|\n|\t| ', '',
                                             website_feeling[0])
        # Detail links are protocol-relative; prefix the scheme.
        for url in urls:
            yield Request("http:" + url, meta={'item': item},
                          callback=self.parse4)
    except Exception:
        # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
        # narrowed to Exception while keeping the best-effort error logging.
        self.logging.error("layer 3 failed, url is:(%s), error info is:(%s)"
                           % (response.url, traceback.format_exc()))
def parse(self, response):
    """Parse a JSON ads listing response, yielding one CarItem per ad."""
    jsonRes = json.loads(response.body)
    # These values are invariant for the whole response; compute them once
    # instead of per ad (the brand code/settings lookup ran inside the loop).
    car_brand_code = re.findall(r'carbrand=[0-9]+',
                                response.url)[0].split("=")[1]
    car_brand = get_project_settings().get("CAR_BRAND")[car_brand_code]
    publish_time = time.time()  # listing endpoint has no per-ad timestamp
    for ad in jsonRes["ads"]:
        # FIX: replaced bare print() debug banners with the spider logger.
        self.logger.debug("ad: %s", ad)
        images = json.dumps([ad["image"]])
        org_link = self.allowed_domains[0] + '/' + str(ad["list_id"]) + ".htm"
        yield CarItem(car_brand=car_brand, km="", car_type="",
                      engine_type="", desc=ad["body"], subject=ad["subject"],
                      publish_time=publish_time, phone_number="",
                      address=ad["region_name"], org_link=org_link,
                      images=images, price=ad["price"])