def parse(self, response):
    """Scrape every product tile on the page, then follow the next-page link.

    For each ``div.block-producto`` element a ``Product`` is filled with the
    detail link, the raw ``<table>`` markup (stored as ``price``), the image
    URL and the fixed category/animal labels.  That product is handed to
    ``parse_price_table`` and one ``ProjectPetsItem`` is yielded for every
    object it returns.

    Yields:
        ``ProjectPetsItem`` instances, followed by a follow-up request for
        the next page when an ``a.fa-chevron-right`` link is present.

    Raises:
        IndexError: if a tile lacks the link, table or image, or the link
            has fewer than seven ``/``-separated pieces.
        AttributeError: if a tile has no detail link at all.
    """
    detail_link_css = 'a.catalogo_click_detail::attr(href)'

    for tile in response.selector.css('div.block-producto'):
        listing = Product()
        # The raw name is taken from the 7th "/"-separated piece of the link.
        raw_name = tile.css(detail_link_css).extract_first().split('/')[6]
        listing.name = parse_name(raw_name)
        listing.href = tile.css(detail_link_css).extract()[0]
        # The whole table markup is kept; parse_price_table works from it.
        listing.price = tile.css('table').extract()[0]
        listing.image_href = tile.css('img::attr(src)').extract()[0]
        listing.category = "accessory"
        listing.animal = "cat"

        for variant in parse_price_table(listing):
            item = ProjectPetsItem()
            # The name goes through parse_name a second time here.
            item['name'] = parse_name(variant.name)
            for field in ('href', 'price', 'image_href', 'store',
                          'category', 'animal', 'date', 'date_str'):
                item[field] = getattr(variant, field)
            yield item

    following = response.css('a.fa-chevron-right::attr(href)').extract_first()
    if following is None:
        return
    yield response.follow(following, callback=self.parse)
def parse(self, response):
    """Yield one item per ``li.show-links-onimage`` product, then paginate.

    Every item is labelled store "Day Mascotas", category "accessory" and
    animal "unspecified", and is stamped with the current local date.

    Yields:
        ``ProjectPetsItem`` instances, followed by a follow-up request when
        a ``li.next a`` link exists.

    Raises:
        IndexError: if a product lacks the title, link, price or image.
    """
    for entry in response.selector.css('li.show-links-onimage'):
        item = ProjectPetsItem()
        item['name'] = parse_name(entry.css('h2::text').extract()[0])
        item['href'] = entry.css('a::attr(href)').extract()[0]

        # When a <del> element is present the price is read from the <ins>
        # amount instead (presumably the discounted price; verify on site).
        has_del = entry.css('del').extract_first(
            default='not-found') != 'not-found'
        price_css = (
            'span.price ins span.woocommerce-Price-amount::text'
            if has_del
            else '.woocommerce-Price-amount::text'
        )
        item['price'] = parse_price(entry.css(price_css).extract()[0])

        item['image_href'] = entry.css('img::attr(src)').extract()[0]
        item['store'] = "Day Mascotas"
        item['category'] = "accessory"
        item['animal'] = "unspecified"
        item['date'] = datetime.today()
        item['date_str'] = item['date'].strftime('%Y-%m-%d')
        yield item

    following = response.css('li.next a::attr(href)').extract_first()
    if following is None:
        return
    yield response.follow(following, callback=self.parse)
def parse(self, response):
    """Yield one item per ``div.product`` element, then follow pagination.

    Every item is labelled store "Noi", category "food" and animal "dog",
    and is stamped with the current local date.

    Yields:
        ``ProjectPetsItem`` instances, followed by a follow-up request when
        a ``.next`` link exists.

    Raises:
        IndexError: if a product lacks the title, link, price or image.
    """
    for entry in response.selector.css('div.product'):
        # NOTE: price extraction is unreliable for this store.  The HTML is
        # not well organized, and getting every price would require opening
        # the product page and clicking through its options.  Only the
        # first listed amount is taken here.
        item = ProjectPetsItem()

        title = entry.css('div.product-title a::text').extract()[0]
        item['name'] = parse_name(title)

        item['href'] = entry.css(
            'a.woocommerce-LoopProduct-link::attr(href)').extract()[0]

        first_amount = entry.css(
            'span.woocommerce-Price-amount::text').extract()[0]
        item['price'] = parse_price(first_amount)

        item['image_href'] = entry.css('img::attr(src)').extract()[0]
        item['store'] = "Noi"
        item['category'] = "food"
        item['animal'] = "dog"
        item['date'] = datetime.today()
        item['date_str'] = item['date'].strftime('%Y-%m-%d')
        yield item

    following = response.css('.next::attr(href)').extract_first()
    if following is None:
        return
    yield response.follow(following, callback=self.parse)
def parse(self, response):
    """Yield "Day Mascotas" items labelled from the page URL, then paginate.

    The animal and category labels are derived from the fifth
    ``/``-separated piece of ``response.url`` by substring matching:
    "perro" -> dog, "gato" -> cat, otherwise "unspecified"; "alimentos" ->
    food, "medicamentos" -> medicine, "accesorios" -> accessory, otherwise
    an empty string.

    Yields:
        ``ProjectPetsItem`` instances, followed by a follow-up request when
        a ``li.next a`` link exists.

    Raises:
        IndexError: if the URL has fewer than five pieces, or a product
            lacks the title, link, price or image.
    """
    # NOTE(review): animal and category are both read from the same URL
    # piece (index 4), as in the original code. Confirm this is intended.
    segment = response.url.split("/")[4]

    if 'perro' in segment:
        animal_label = 'dog'
    elif 'gato' in segment:
        animal_label = 'cat'
    else:
        animal_label = 'unspecified'

    category_label = ''
    for keyword, label in (('alimentos', 'food'),
                           ('medicamentos', 'medicine'),
                           ('accesorios', 'accessory')):
        if keyword in segment:
            category_label = label
            break

    for entry in response.selector.css('li.show-links-onimage'):
        item = ProjectPetsItem()
        item['name'] = parse_name(entry.css('h2::text').extract()[0])
        item['href'] = entry.css('a::attr(href)').extract()[0]

        # When a <del> element is present the price is read from the <ins>
        # amount instead (presumably the discounted price; verify on site).
        has_del = entry.css('del').extract_first(
            default='not-found') != 'not-found'
        price_css = (
            'span.price ins span.woocommerce-Price-amount::text'
            if has_del
            else '.woocommerce-Price-amount::text'
        )
        item['price'] = parse_price(entry.css(price_css).extract()[0])

        item['image_href'] = entry.css('img::attr(src)').extract()[0]
        item['store'] = "Day Mascotas"
        item['category'] = category_label
        item['animal'] = animal_label
        item['date'] = datetime.today()
        item['date_str'] = item['date'].strftime('%Y-%m-%d')
        yield item

    following = response.css('li.next a::attr(href)').extract_first()
    if following is None:
        return
    yield response.follow(following, callback=self.parse)
def parse(self, response):
    """Yield "Pet Happy" items labelled from the page URL, then paginate.

    The fourth ``/``-separated piece of ``response.url`` selects the animal
    ("perros-2" -> dog, "gatos-2" -> cat, anything else -> empty string).
    The fifth piece selects the category by substring: "alimentos" -> food,
    "medicamentos" -> medicine, "accesorios" -> accessory, otherwise an
    empty string.

    Yields:
        ``ProjectPetsItem`` instances, followed by a follow-up request when
        a ``li.next a`` link exists.

    Raises:
        IndexError: if the URL has fewer than five pieces, or a product
            lacks the link, price or image.
    """
    url_pieces = response.url.split("/")
    animal_slug = url_pieces[3]
    category_slug = url_pieces[4]

    animal_label = {'perros-2': 'dog', 'gatos-2': 'cat'}.get(animal_slug, '')

    category_label = ''
    for keyword, label in (('alimentos', 'food'),
                           ('medicamentos', 'medicine'),
                           ('accesorios', 'accessory')):
        if keyword in category_slug:
            category_label = label
            break

    for entry in response.css('div.in'):
        item = ProjectPetsItem()
        item['name'] = parse_name(entry.css('h1 a::text').extract_first())

        # The extracted link is joined onto the site root by plain string
        # concatenation.
        relative_link = entry.css('p.foto a::attr(href)').extract()[0]
        item['href'] = "https://www.pethappy.cl" + relative_link

        price_text = entry.css('p.precio::text').extract()[0]
        item['price'] = parse_price(price_text)

        item['image_href'] = entry.css('p a img::attr(src)').extract()[0]
        item['store'] = "Pet Happy"
        item['category'] = category_label
        item['animal'] = animal_label
        item['date'] = datetime.today()
        item['date_str'] = item['date'].strftime('%Y-%m-%d')
        yield item

    following = response.css('li.next a::attr(href)').extract_first()
    if following is None:
        return
    yield response.follow(following, callback=self.parse)
def parse(self, response):
    """Yield one "Pet Happy" cat-accessory item per product, then paginate.

    Every ``div.in`` element becomes a ``ProjectPetsItem`` labelled store
    "Pet Happy", category "accessory" and animal "cat", stamped with the
    current local date.

    Yields:
        ``ProjectPetsItem`` instances, followed by a follow-up request when
        a ``li.next a`` link exists.

    Raises:
        IndexError: if a product lacks the link, price or image.
    """
    for product in response.css('div.in'):
        item = ProjectPetsItem()
        item['name'] = parse_name(
            product.css('h1 a::text').extract_first())
        # The extracted link is joined onto the site root by plain string
        # concatenation.
        item['href'] = "https://www.pethappy.cl" + product.css(
            'p.foto a::attr(href)').extract()[0]
        item['price'] = parse_price(
            product.css('p.precio::text').extract()[0])
        item['image_href'] = product.css('p a img::attr(src)').extract()[0]
        item['store'] = "Pet Happy"
        # Was "accessories"; the other spiders in this file all emit the
        # singular "accessory", so the plural split one category in two.
        item['category'] = "accessory"
        item['animal'] = "cat"
        item['date'] = datetime.today()
        item['date_str'] = item['date'].strftime('%Y-%m-%d')
        yield item

    next_page = response.css('li.next a::attr(href)').extract_first()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)