Пример #1
0
    def parse_url(response):
        """Build one ``CarItem`` from a product detail page.

        Args:
            response: Object exposing ``xpath()`` whose result supports
                ``extract()`` and ``extract_first()``.

        Yields:
            A single ``CarItem`` with ``model``, ``title``, ``price``,
            ``description``, ``manufacturer``, ``image`` and ``category`` set.
            ``price`` is the digits of the price text concatenated, or an
            empty string when the price element is missing. The other
            single-value fields are ``None`` when their node is missing.
        """

        def first(query):
            # First match of the query, or None when nothing matches.
            return response.xpath(query).extract_first()

        title = first('//h1[@class="title-page"]/text()')
        model = first('//meta[@itemprop="model"]/@content')

        raw_price = first('//span[@class="or_price price"]/text()')
        # extract_first() yields None for a missing node; iterating None would
        # raise TypeError, so fall back to an empty string.
        price = "".join(ch for ch in (raw_price or "") if ch.isdigit())

        # Keep every direct child of the content column except the "buy",
        # "row" and "clearfix" divs, serialized and concatenated.
        description = "".join(
            response.xpath(
                '//div[@class="span7 offset1"]/*[not(self::div[@class="buy"]) and not(self::div[@class="row"]) and not(self::div[@class="clearfix"])]'
            ).extract())

        manufacturer = first('//meta[@itemprop="manufacturer"]/@content')
        image = first('//meta[@itemprop="image"]/@content')
        # The second-to-last breadcrumb entry is used as the category.
        category = first(
            '//span[@typeof="v:Breadcrumb" and position() = (last()-1)]/a/@alt')

        item = CarItem()
        item['model'] = model
        item['title'] = title
        item['price'] = price
        item['description'] = description
        item['manufacturer'] = manufacturer
        item['image'] = image
        item['category'] = category
        yield item
Пример #2
0
 def parse(self, response):
     """Yield one ``CarItem`` per search-result card on the page.

     Args:
         response: Response whose ``selector`` is queried with XPath.

     Yields:
         ``CarItem`` objects with ``car_name`` and ``pic_src`` (lists of
         extracted strings), ``max_price`` and ``min_price``. A price field
         is ``None`` when the card has no price text or the text contains no
         ``'-'`` separator, so one incomplete card no longer aborts the
         whole page.
     """
     results = response.selector.xpath('//div[@class="search-result-list-item"]')
     for result in results:
         item = CarItem()
         item['car_name'] = result.xpath("./a/p[@class='cx-name text-hover']/text()").extract()

         # Evaluate the price node once; extract_first() returns None
         # instead of raising when the node is absent.
         price_text = result.xpath("./a/p[@class='cx-price']/text()").extract_first()
         parts = price_text.split('-') if price_text is not None else []
         # NOTE(review): the part before '-' is stored as max_price and the
         # part after it as min_price; if the page lists "low-high" these
         # names are swapped - confirm against the site before changing.
         item['max_price'] = parts[0] if parts else None
         # The last two characters of the second part are dropped
         # (presumably a unit suffix - TODO confirm).
         item['min_price'] = parts[1][:-2] if len(parts) > 1 else None

         item['pic_src'] = result.xpath("./a/img/@src").extract()
         yield item
Пример #3
0
    def parse(self, response):
        """Yield one ``CarItem`` per ``uibox`` section, skipping the first.

        Each item carries the stripped text of the section's first title
        link (``cate``) and the section's image sources resolved against the
        response URL (``urls``). The category is also printed between two
        separator lines.
        """
        boxes = response.xpath(
            "//div[@class='column grid-16']/div[@class='uibox']")
        banner = "=" * 30

        for box in boxes[1:]:
            title_text = box.xpath(
                ".//div[@class='uibox-title']/a[1]/text()").get()
            cate = title_text.strip()
            # Image sources are made absolute relative to the page URL.
            urls = [
                response.urljoin(src)
                for src in box.xpath(".//ul/li/a/img/@src").getall()
            ]

            print(banner)
            print(cate)
            print(banner)
            yield CarItem(cate=cate, urls=urls)
Пример #4
0
 def parse_item(self, response):
     """Return a ``CarItem`` describing the car on a detail page.

     ``name`` is always set. When a div whose class contains "zaishou" is
     present, the reference and guide prices are read from the page. When
     instead a div whose class contains "weishangshi" is present, both
     prices are set to a placeholder. If neither marker is found, only
     ``name`` is populated.
     """

     def stripped_text(query):
         # First match of the query with surrounding whitespace removed.
         return response.xpath(query).get().strip()

     item = CarItem()
     item['name'] = stripped_text(
         '//span[@class="yiche-breadcrumb_item-txt"]/text()')

     if not response.xpath('//div[contains(@class,"zaishou")]'):
         # No "zaishou" marker: fall back to the "weishangshi" marker.
         if response.xpath('//div[contains(@class,"weishangshi")]'):
             item['ref_price'] = "无"
             item['guide_price'] = "无"
             item['sell_now'] = "未上市"
         return item

     item['ref_price'] = stripped_text('//span[@class="ref-price"]/text()')
     item['guide_price'] = stripped_text('//span[@class="guide-price"]/text()')
     item['sell_now'] = "在售"
     return item
Пример #5
0
    def parse(self, response):
        """Parse one ranking page and request the next one.

        Args:
            response: Response for a ranking list page.

        Yields:
            One ``CarItem`` per ``<li>`` row, followed by a
            ``scrapy.Request`` for the next page while ``self.offset`` is
            below 30.

        Raises:
            IndexError: If a row lacks one of the fields read with
                ``extract()[0]`` (only ``dispt`` and ``gear`` have fallbacks).
        """
        rows = response.xpath('//div[@class="tbA"]/ul/li')

        for each in rows:
            # Create a fresh item per row. Re-yielding one shared instance
            # would let later rows overwrite the data of items that were
            # already handed to the engine.
            item = CarItem()

            item['ranking'] = each.xpath('./span/text()').extract()[0]
            item['car_name'] = each.xpath(
                './div[@class="info"]/p[@class="sname"]/a/text()').extract()[0]
            item['price'] = each.xpath(
                './div[@class="info"]/p[@class="col col1 price"]/em/text()'
            ).extract()[0]
            item['hot'] = each.xpath(
                './div[@class="info"]/p[@class="col rank"]/span[@class="fl red rd-mark"]/text()'
            ).extract()[0]
            item['brand'] = each.xpath(
                './div[@class="info"]/p[@class="col col1"][1]/text()'
            ).extract()[0]
            item['style'] = each.xpath(
                './div[@class="info"]/p[@class="col"][1]/text()').extract()[0]

            # dispt (engine displacement) and gear (gearbox) may be absent,
            # so indexing directly could raise; check before reading and
            # fall back to a placeholder.
            dispt_nodes = each.xpath(
                './div[@class="info"]/p[@class="col col1"]/em')
            if dispt_nodes:
                item['dispt'] = dispt_nodes[0].xpath('string(.)').extract()[0]
            else:
                item['dispt'] = '暂无信息'

            gear_nodes = each.xpath('./div[@class="info"]/p[@class="col"]/em')
            if gear_nodes:
                item['gear'] = gear_nodes[0].xpath('string(.)').extract()[0]
            else:
                item['gear'] = '暂无信息'

            yield item

        # Follow the next ranking page until the page counter reaches 30.
        if self.offset < 30:
            self.offset += 1
            url = 'https://price.pcauto.com.cn/top/k75-p{0}.html'.format(
                str(self.offset))
            self.url = url
            yield scrapy.Request(self.url, callback=self.parse)
Пример #6
0
    def parseItem(self, response):
        """Extract one ``CarItem`` from a car advertisement detail page.

        Args:
            response: Response for the advertisement page.

        Yields:
            A single ``CarItem``. ``km``, ``car_type`` and ``engine_type``
            default to an empty string when the page has no matching
            labelled row.

        Raises:
            AttributeError, IndexError, ValueError: If the title line or the
                contact block does not have the expected layout (these
                lookups are unguarded, as before).
        """
        # Default the optional attributes so a page missing one of the
        # labelled rows no longer fails with UnboundLocalError.
        km = car_type = engine_type = ""

        for row in response.xpath('//*[@id="mail_parent"]'):
            # Relative XPaths keep the lookup inside this row; a missing
            # label is treated as empty text rather than None.
            label = row.xpath('.//label/text()').get() or ""
            value = row.xpath('.//span/text()').get()
            if "Số Km đã đi:" in label:
                km = value
            elif "Hộp số:" in label:
                car_type = value
            elif "Động cơ:" in label:
                engine_type = value

        desc = ' '.join(
            response.xpath(
                '//div[contains(@class,"car_des")]/div/text()').getall())
        subject = response.xpath(
            '//div[contains(@class,"title")]/h1/text()').get()

        # Convert the dd/mm/yyyy date (third whitespace-separated token of
        # the title line) to a Unix timestamp in local time.
        date_text = response.xpath(
            '//div[contains(@class,"title")]/div/text()').get().split()[2]
        publish_time = time.mktime(
            datetime.datetime.strptime(date_text, "%d/%m/%Y").timetuple())

        contact_parts = response.xpath(
            '//div[contains(@class,"contact-txt")]//text()').getall()
        # The fourth text node holds the phone number; keep its digits only.
        phone_number = ''.join(re.findall(r'\d+', contact_parts[3]))
        address = contact_parts[-1]

        images = json.dumps(
            response.xpath(
                '//div[contains(@class,"highslide-gallery")]//img/@src'
            ).getall())
        car_brand = response.xpath(
            '//*[@id="wrapper"]/div[2]/span[4]/a/span/strong//text()').get()
        org_link = response.url

        carItem = CarItem(car_brand=car_brand,
                          km=km,
                          car_type=car_type,
                          engine_type=engine_type,
                          desc=desc,
                          subject=subject,
                          publish_time=publish_time,
                          phone_number=phone_number,
                          address=address,
                          org_link=org_link,
                          images=images)
        yield carItem
Пример #7
0
 def parse3(self, response):
     """Collect site-level fields and schedule one request per detail link.

     Args:
         response: Response for the third-layer listing page.

     Yields:
         A ``Request`` for every link found, with ``self.parse4`` as the
         callback and the partially filled ``CarItem`` in ``meta['item']``.

     Errors raised while parsing are logged and not propagated.
     """
     self.logging.info("url is: %s." % response.url)
     try:
         item = CarItem()
         urls = response.xpath(
             "//div[@class='title-name name-width-01']/a/@href").extract()

         website_name = response.xpath(
             "//div[@class='subnav-title-name']/a").extract()
         if website_name:
             item['website_name'] = common_tools.data_cleaning(website_name[0])

         # The remaining fields are serialized elements; strip tags and
         # line/tab characters (and spaces for the "feeling" block).
         website_price = response.xpath(
             "//div[@class='price']/span[@class='font-16']").extract()
         if website_price:
             item['website_price'] = re.sub('<.*?>|\r|\n|\t', '',
                                            website_price[0])

         website_score = response.xpath(
             "//span[@class='font-arial number-fen']").extract()
         if website_score:
             item['website_score'] = re.sub('<.*?>|\r|\n|\t', '',
                                            website_score[0])

         website_feeling = response.xpath(
             "//div[@class='revision-impress impress-small']").extract()
         if website_feeling:
             item['website_feeling'] = re.sub('<.*?>|\r|\n|\t| ', '',
                                              website_feeling[0])

         # NOTE(review): every request receives the same item object; if
         # parse4 mutates it, the requests will affect each other - confirm
         # whether a per-request copy is needed.
         for url in urls:
             # Links are prefixed with the scheme (presumably they are
             # protocol-relative - TODO confirm).
             yield Request("http:" + url,
                           meta={'item': item},
                           callback=self.parse4)
     except Exception:
         # Exception rather than a bare except, so GeneratorExit (raised at
         # the yield when the generator is closed), KeyboardInterrupt and
         # SystemExit are not swallowed and misreported as parse failures.
         self.logging.error("layer 3 failed, url is:(%s), error info is:(%s)" \
                        % (response.url, traceback.format_exc()))
Пример #8
0
 def parse(self, response):
     """Yield one ``CarItem`` per entry of the JSON response's ``"ads"`` list.

     Each ad is printed between two separator lines. Fields the JSON
     listing does not provide (phone number, mileage, car type, engine
     type) are emitted as empty strings, the publish time is the current
     time, and the brand is looked up in the ``CAR_BRAND`` project setting
     using the ``carbrand=<digits>`` value from the response URL.
     """
     payload = json.loads(response.body)
     for ad in payload["ads"]:
         print("################################################")
         print(ad)
         print("################################################")

         ad_body = ad["body"]
         ad_subject = ad["subject"]
         region = ad["region_name"]
         ad_price = ad["price"]
         # The single image is stored as a JSON-encoded one-element list.
         encoded_images = json.dumps([ad["image"]])
         fetched_at = time.time()
         link = self.allowed_domains[0] + '/' + str(ad["list_id"]) + ".htm"

         brand_param = re.findall(r'carbrand=[0-9]+', response.url)[0]
         brand_code = brand_param.split("=")[1]
         brand = get_project_settings().get("CAR_BRAND")[brand_code]

         yield CarItem(car_brand=brand,
                       km="",
                       car_type="",
                       engine_type="",
                       desc=ad_body,
                       subject=ad_subject,
                       publish_time=fetched_at,
                       phone_number="",
                       address=region,
                       org_link=link,
                       images=encoded_images,
                       price=ad_price)