Python parse_name示例，project_pets.spiders.utils.parse_name Python示例

示例#1

0

显示文件

    def parse(self, response):
        """Yield one ProjectPetsItem per priced variant of each product tile.

        Each ``div.block-producto`` element is copied into a ``Product``
        (name, link, raw ``table`` HTML, image URL and the fixed labels
        "accessory"/"cat") and passed to ``parse_price_table``. Every object
        that helper returns is turned into one item. Afterwards the
        ``a.fa-chevron-right`` link, when present, is followed with this
        method as the callback.

        Note: a tile missing the detail link, the table or the image raises
        (``AttributeError``/``IndexError``) instead of being skipped.
        """
        detail_link_query = 'a.catalogo_click_detail::attr(href)'
        # Attributes copied verbatim from each parsed variant onto the item.
        copied_fields = (
            'href', 'price', 'image_href', 'store',
            'category', 'animal', 'date', 'date_str',
        )

        for tile in response.selector.css('div.block-producto'):
            draft = Product()

            # The name comes from the seventh "/"-separated part of the link.
            first_link = tile.css(detail_link_query).extract_first()
            draft.name = parse_name(first_link.split('/')[6])
            draft.href = tile.css(detail_link_query).extract()[0]
            # The whole <table> markup is stored; parse_price_table reads it.
            draft.price = tile.css('table').extract()[0]
            draft.image_href = tile.css('img::attr(src)').extract()[0]
            draft.category = "accessory"
            draft.animal = "cat"

            for variant in parse_price_table(draft):
                item = ProjectPetsItem()
                # parse_name is applied again to the variant's name.
                item['name'] = parse_name(variant.name)
                for field in copied_fields:
                    item[field] = getattr(variant, field)

                yield item

        next_href = response.css(
            'a.fa-chevron-right::attr(href)').extract_first()
        if next_href is None:
            return
        yield response.follow(next_href, callback=self.parse)

示例#2

0

显示文件

    def parse(self, response):
        """Yield a ProjectPetsItem per ``li.show-links-onimage`` tile.

        Every item is labelled store "Day Mascotas", category "accessory"
        and animal "unspecified", and stamped with the current date. After
        the tiles, the ``li.next a`` link (if any) is followed with this
        method as the callback.
        """
        ins_amount_query = 'span.price ins span.woocommerce-Price-amount::text'
        any_amount_query = '.woocommerce-Price-amount::text'

        for tile in response.selector.css('li.show-links-onimage'):
            item = ProjectPetsItem()

            title = tile.css('h2::text').extract()[0]
            item['name'] = parse_name(title)
            item['href'] = tile.css('a::attr(href)').extract()[0]

            # When the tile contains a <del> element (presumably a
            # struck-through old price), read the amount nested under <ins>;
            # otherwise take the first amount found in the tile.
            has_del = tile.css('del').extract_first(
                default='not-found') != 'not-found'
            amount_query = ins_amount_query if has_del else any_amount_query
            raw_price = tile.css(amount_query).extract()[0]
            item['price'] = parse_price(raw_price)

            item['image_href'] = tile.css('img::attr(src)').extract()[0]
            item['store'] = "Day Mascotas"
            item['category'] = "accessory"
            item['animal'] = "unspecified"

            scraped_at = datetime.today()
            item['date'] = scraped_at
            item['date_str'] = scraped_at.strftime('%Y-%m-%d')

            yield item

        next_href = response.css('li.next a::attr(href)').extract_first()
        if next_href is None:
            return
        yield response.follow(next_href, callback=self.parse)

示例#3

0

显示文件

文件： noi.py 项目： javier-gs00/Project-Pets-Crawlers

    def parse(self, response):
        """Yield a ProjectPetsItem per ``div.product`` element.

        Items are labelled store "Noi", category "food" and animal "dog",
        and stamped with the current date. Once the products are emitted,
        the ``.next`` link (if any) is followed with this method as the
        callback.
        """
        for tile in response.selector.css('div.product'):
            # NOTE: price extraction is known to be incomplete. The HTML is
            # not organized, and getting every price requires opening the
            # product page and clicking through some options. Only the first
            # amount found in the tile is used here.
            item = ProjectPetsItem()

            title = tile.css('div.product-title a::text').extract()[0]
            item['name'] = parse_name(title)

            link_query = 'a.woocommerce-LoopProduct-link::attr(href)'
            item['href'] = tile.css(link_query).extract()[0]

            amounts = tile.css('span.woocommerce-Price-amount::text').extract()
            item['price'] = parse_price(amounts[0])

            item['image_href'] = tile.css('img::attr(src)').extract()[0]
            item['store'] = "Noi"
            item['category'] = "food"
            item['animal'] = "dog"

            scraped_at = datetime.today()
            item['date'] = scraped_at
            item['date_str'] = scraped_at.strftime('%Y-%m-%d')

            yield item

        next_href = response.css('.next::attr(href)').extract_first()
        if next_href is None:
            return
        yield response.follow(next_href, callback=self.parse)

示例#4

0

显示文件

    def parse(self, response):
        """Yield a ProjectPetsItem per ``li.show-links-onimage`` tile.

        The animal and category labels are both derived from the same URL
        part, ``response.url.split("/")[4]``, by substring match:

        * animal: "perro" -> "dog", "gato" -> "cat", else "unspecified"
        * category: "alimentos" -> "food", "medicamentos" -> "medicine",
          "accesorios" -> "accessory", else an empty string

        Items are labelled store "Day Mascotas" and stamped with the current
        date. After the tiles, the ``li.next a`` link (if any) is followed
        with this method as the callback.
        """
        url_part = response.url.split("/")[4]

        if 'perro' in url_part:
            item_animal = 'dog'
        else:
            item_animal = 'cat' if 'gato' in url_part else 'unspecified'

        # First matching keyword wins; no match leaves the category empty.
        category_keywords = (
            ('alimentos', 'food'),
            ('medicamentos', 'medicine'),
            ('accesorios', 'accessory'),
        )
        item_category = ''
        for keyword, label in category_keywords:
            if keyword in url_part:
                item_category = label
                break

        ins_amount_query = 'span.price ins span.woocommerce-Price-amount::text'
        any_amount_query = '.woocommerce-Price-amount::text'

        for tile in response.selector.css('li.show-links-onimage'):
            item = ProjectPetsItem()

            title = tile.css('h2::text').extract()[0]
            item['name'] = parse_name(title)
            item['href'] = tile.css('a::attr(href)').extract()[0]

            # When the tile contains a <del> element (presumably a
            # struck-through old price), read the amount nested under <ins>;
            # otherwise take the first amount found in the tile.
            has_del = tile.css('del').extract_first(
                default='not-found') != 'not-found'
            amount_query = ins_amount_query if has_del else any_amount_query
            raw_price = tile.css(amount_query).extract()[0]
            item['price'] = parse_price(raw_price)

            item['image_href'] = tile.css('img::attr(src)').extract()[0]
            item['store'] = "Day Mascotas"
            item['category'] = item_category
            item['animal'] = item_animal

            scraped_at = datetime.today()
            item['date'] = scraped_at
            item['date_str'] = scraped_at.strftime('%Y-%m-%d')

            yield item

        next_href = response.css('li.next a::attr(href)').extract_first()
        if next_href is None:
            return
        yield response.follow(next_href, callback=self.parse)

示例#5

0

显示文件

文件： pethappy.py 项目： javier-gs00/Project-Pets-Crawlers

    def parse(self, response):
        """Yield a ProjectPetsItem per ``div.in`` element.

        Labels are derived from the "/"-separated parts of ``response.url``:

        * part 3 gives the animal: "perros-2" -> "dog", "gatos-2" -> "cat",
          anything else -> empty string
        * part 4 gives the category by substring: "alimentos" -> "food",
          "medicamentos" -> "medicine", "accesorios" -> "accessory",
          anything else -> empty string

        Items are labelled store "Pet Happy" and stamped with the current
        date. After the products, the ``li.next a`` link (if any) is followed
        with this method as the callback.
        """
        url_parts = response.url.split("/")
        animal_part = url_parts[3]
        category_part = url_parts[4]

        item_animal = {'perros-2': 'dog', 'gatos-2': 'cat'}.get(
            animal_part, '')

        # First matching keyword wins; no match leaves the category empty.
        category_keywords = (
            ('alimentos', 'food'),
            ('medicamentos', 'medicine'),
            ('accesorios', 'accessory'),
        )
        item_category = ''
        for keyword, label in category_keywords:
            if keyword in category_part:
                item_category = label
                break

        for tile in response.css('div.in'):
            item = ProjectPetsItem()

            # extract_first() yields None when the title link is missing, so
            # parse_name may receive None here.
            title = tile.css('h1 a::text').extract_first()
            item['name'] = parse_name(title)

            # The scraped link is appended to the site root as-is.
            link_path = tile.css('p.foto a::attr(href)').extract()[0]
            item['href'] = "https://www.pethappy.cl" + link_path

            raw_price = tile.css('p.precio::text').extract()[0]
            item['price'] = parse_price(raw_price)

            item['image_href'] = tile.css('p a img::attr(src)').extract()[0]
            item['store'] = "Pet Happy"
            item['category'] = item_category
            item['animal'] = item_animal

            scraped_at = datetime.today()
            item['date'] = scraped_at
            item['date_str'] = scraped_at.strftime('%Y-%m-%d')

            yield item

        next_href = response.css('li.next a::attr(href)').extract_first()
        if next_href is None:
            return
        yield response.follow(next_href, callback=self.parse)

示例#6

0

显示文件

文件： pethappy.py 项目： javier-gs00/Project-Pets-Crawlers

    def parse(self, response):
        """Yield a ProjectPetsItem per ``div.in`` element.

        Items are labelled store "Pet Happy", category "accessory" and
        animal "cat", and stamped with the current date. After the products,
        the ``li.next a`` link (if any) is followed with this method as the
        callback.
        """
        for product in response.css('div.in'):
            item = ProjectPetsItem()

            # extract_first() yields None when the title link is missing, so
            # parse_name may receive None here.
            item['name'] = parse_name(
                product.css('h1 a::text').extract_first())
            # The scraped link is appended to the site root as-is.
            item['href'] = "https://www.pethappy.cl" + product.css(
                'p.foto a::attr(href)').extract()[0]
            item['price'] = parse_price(
                product.css('p.precio::text').extract()[0])
            item['image_href'] = product.css('p a img::attr(src)').extract()[0]
            item['store'] = "Pet Happy"
            # Use the singular label so items share one category value with
            # the other parse callbacks, which all emit "accessory".
            item['category'] = "accessory"
            item['animal'] = "cat"
            item['date'] = datetime.today()
            item['date_str'] = item['date'].strftime('%Y-%m-%d')

            yield item

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)