Python price_normalize примеры, text_parser.price_normalize Python примеры использования

Пример #1

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'prune' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     # Wait for the size/colour options to load before reading the page.
     sizes_xpath = './/div[@role="option"]/text()'
     self.is_visible(sizes_xpath, timeout=2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'prune'
     # The first word of the <title> text is stored as the breadcrumb.
     page_title = page.xpath('.//title/text()').extract()[0]
     item['breadcrumb'] = page_title.split(' ', 1)[0]
     heading = page.xpath('.//div[@class="page-title-wrapper product"]/h1/span/text()')
     item['title'] = heading.extract()[0]
     item['sizes'] = page.xpath(sizes_xpath).extract()
     item['color'] = page.xpath('.//div[@class="swatch-option color selected"]/@aria-label').extract()
     description_nodes = page.xpath('.//div[@class="product attribute description"]/div/ul').extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = page.xpath('.//div[@itemprop="sku"]/text()').extract()[0]
     raw_price = page.xpath('.//span[@class="price"]/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     item['other'] = None
     item['image_urls'] = page.xpath('.//img[@class="img-responsive"]/@src').extract()
     yield item

Пример #2

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'mishka' product page and yield it as an Item."""
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(5)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'mishka'
     item['breadcrumb'] = []
     title_nodes = page.xpath('.//span[@itemprop="name"]/text()').extract()
     item['title'] = html_text_normalize(title_nodes)
     description_nodes = page.xpath(
         './/div[@class="product attribute description"]/div/text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = page.xpath('.//div[@itemprop="sku"]/text()').extract()[0]
     raw_price = page.xpath('.//span[contains(@id,"product-price")]/span/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     size_labels = page.xpath(
         './/div[@class="swatch-attribute size"]/div[@class="swatch-attribute-options clearfix"]/div/text()'
     ).extract()
     item['sizes'] = sizes_normalize(size_labels)
     # Image URLs are rebuilt from the first non-thumbnail image: its last
     # five characters are dropped, then "<index>.jpg" is appended once per
     # thumbnail found on the page.
     main_image = page.xpath(
         './/img[contains(@src, "https://www.mishka.com.ar/media/catalog/product/cache/") and not(contains(@src, "thumb"))]/@src'
     ).extract()[0]
     url_stem = main_image[:-5]
     thumb_count = len(page.xpath('.//img[contains(@src, "thumb")]/@src').extract())
     item['image_urls'] = [url_stem + str(index) + '.jpg' for index in range(thumb_count)]
     yield item

Пример #3

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'dos61' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'dos61'
     item['breadcrumb'] = []
     item['title'] = page.xpath('.//h1[@itemprop="name"]/text()').extract()[0]
     content_text = page.xpath('.//div[@class="content"]//text()').extract()
     item['description'] = html_text_normalize(content_text)
     item['code'] = ''
     amounts = page.xpath(
         './/p[@class="price"]//span[@class="woocommerce-Price-amount amount"]/text()'
     ).extract()
     # When several amounts are present the last one is used; an empty
     # result raises IndexError.
     item['price'] = price_normalize(amounts[-1])
     item['sizes'] = page.xpath(
         './/div[@data-attribute="pa_talle"]/span[contains(@class, "ivpa_instock")]/@data-term'
     ).extract()
     item['image_urls'] = page.xpath('.//a[@data-slide-index]/img/@src').extract()
     yield item

Пример #4

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'vitamina' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'vitamina'
     item['breadcrumb'] = []
     item['title'] = page.xpath('.//h1[@id="nombreProducto"]/text()').extract()[0]
     description_nodes = page.xpath('.//p[@itemprop="description"]/text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = ''
     # Prefer the "special-price" value; fall back to the first itemprop
     # price on the page when there is none.
     special = page.xpath(
         './/section[@id="datos"]//p[@class="special-price"]/span[@itemprop="price" and @class="price"]/@content'
     ).extract()
     if special:
         raw_price = special[0]
     else:
         raw_price = page.xpath('.//span[@itemprop="price"]/@content').extract()[0]
     item['price'] = price_normalize(raw_price)
     item['sizes'] = page.xpath('.//li[@class="swatchContainer"]/div[@class="swatch"]/text()').extract()
     item['image_urls'] = page.xpath('.//div[@class="fotozoom"]/img[@class="zoomImg"]/@src').extract()
     yield item

Пример #5

0

Показать файл

Файл: lazaro.py Проект: EitanRosenzvaig/crawler

 def parse_item(self, response):
     """Scrape one 'lazaro' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'lazaro'
     item['breadcrumb'] = []
     heading = page.xpath('.//div[@class="product-main-info text-center"]//h1/text()')
     item['title'] = heading.extract()[0]
     description_nodes = page.xpath('.//div[@id="collapseOne"]/div/text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     # The SKU text carries a 'SKU# ' label that is stripped out.
     sku_text = page.xpath('.//div[@class="sku"]/text()').extract()[0]
     item['code'] = sku_text.replace('SKU# ', '')
     raw_price = page.xpath('.//span[@class="price"]/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     # Labels whose class contains "no-stock" are excluded by the XPath.
     item['sizes'] = page.xpath(
         './/div[@class="amconf-images-container switcher-field"]//label[not(contains(@class,"no-stock"))]/text()'
     ).extract()
     item['image_urls'] = page.xpath('.//div[@id="gallery_01"]//li/a/@data-image').extract()
     yield item

Пример #6

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'lucianomarra' product page and yield it as an Item."""
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'lucianomarra'
     item['breadcrumb'] = []
     title_nodes = page.xpath('.//h1[@itemprop="name"]/span/text()').extract()
     item['title'] = html_text_normalize(title_nodes)
     description_nodes = page.xpath('.//div[@id="tab-description"]//text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = ''
     # Use the price inside <ins> when present, otherwise the plain <span>.
     ins_prices = page.xpath('.//div[@class="product-price"]/p/ins/span/text()').extract()
     if ins_prices:
         raw_price = ins_prices[0]
     else:
         raw_price = page.xpath('.//div[@class="product-price"]/p/span/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     size_labels = page.xpath('.//div[@class="select_option_label select_option"]/span/text()').extract()
     item['sizes'] = sizes_normalize(size_labels)
     # Fall back to the carousel links when the main gallery yields nothing.
     gallery = page.xpath('.//div[@class="images"]//a[@itemprop="image"]/@href').extract()
     if not gallery:
         gallery = page.xpath('.//div[@class="caroufredsel_wrapper"]//li/a/@href').extract()
     item['image_urls'] = gallery
     yield item

Пример #7

0

Показать файл

Файл: margiefranzini.py Проект: EitanRosenzvaig/crawler

 def parse_item(self, response):
     """Scrape one 'margiefranzini' product page and yield it as an Item."""
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'margiefranzini'
     item['breadcrumb'] = []
     # Remove the brand name from the heading, longer variant first.
     heading = page.xpath('.//h1[@class="title border"]/text()').extract()[0]
     without_shoes = heading.replace(' Margie Franzini Shoes ', ' ')
     item['title'] = without_shoes.replace(' Margie Franzini ', ' ')
     description_nodes = page.xpath('.//article[@id="tabDescription"]/p/text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = ''
     # Prefer the promotional price block; otherwise take the first price.
     promo_prices = page.xpath(
         './/dl[@class="priceInfo clearfix promotionPrice"]//span[@class="ch-price price"]/text()'
     ).extract()
     if promo_prices:
         raw_price = promo_prices[0]
     else:
         raw_price = page.xpath('.//span[@class="ch-price price"]/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     size_labels = page.xpath(
         './/menu/li/span[not(contains(text(),"Talle"))]/text()').extract()
     if not size_labels:
         size_labels = page.xpath(
             './/span[@data-idx="1" and contains(text(),"Talle")]/text()'
         ).extract()
     item['sizes'] = sizes_normalize(size_labels)
     image_sources = page.xpath('.//li[@role="listitem"]/img/@src').extract()
     # The first two characters of every image src are dropped.
     item['image_urls'] = [src[2:] for src in image_sources]
     yield item

Пример #8

0

Показать файл

Файл: febo.py Проект: EitanRosenzvaig/crawler

 def parse_item(self, response):
     """Scrape one 'febo' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'febo'
     item['breadcrumb'] = page.xpath(
         './/a[contains(@href,"javascript:Form")]/text()').extract()
     item['title'] = page.xpath('.//articulo_det/text()').extract()[0]
     description_nodes = page.xpath('.//descripcion_det/p/text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     # NOTE(review): the code is read from the same node as the title --
     # confirm this is intended.
     item['code'] = page.xpath('.//articulo_det/text()').extract()[0]
     raw_price = page.xpath('.//precio_det[@id="preciohtml"]/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     item['sizes'] = page.xpath(
         './/div[@class="talles" and img/@src="img/btn_S.jpg"]/span/text()'
     ).extract()
     item['other'] = None
     # Image paths are prefixed with the site root.
     photo_paths = page.xpath('.//foto_principal/img/@src').extract()
     item['image_urls'] = ['https://zapateriafebo.com/' + path for path in photo_paths]
     yield item

Пример #9

0

Показать файл

Файл: cestfini.py Проект: EitanRosenzvaig/crawler

 def parse_item(self, response):
     """Scrape one 'cestfini' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'cestfini'
     item['breadcrumb'] = []
     item['title'] = page.xpath('.//h1[@itemprop="name"]/text()').extract()[0]
     description_nodes = page.xpath(
         './/div[@class="descripcion_cestfini"]//text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = ''
     raw_price = page.xpath(
         './/div[@class="span4 force100ipad"]//span[@id="price_display"]/text()'
     ).extract()[0]
     item['price'] = price_normalize(raw_price)
     # Duplicate size labels are collapsed through a set.
     size_labels = page.xpath(
         './/div[@data-variant="Talle"]//span[@class="custom-variants" and not(@style)]/text()'
     ).extract()
     item['sizes'] = list(set(size_labels))
     gallery_links = page.xpath('.//a[@class="cloud-zoom-gallery"]/@href').extract()
     # The first two characters of every link are dropped.
     item['image_urls'] = [link[2:] for link in gallery_links]
     yield item

Пример #10

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'blaque' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'blaque'
     item['breadcrumb'] = ''
     item['title'] = page.xpath('.//h2[@class="tituloproducto"]/text()').extract()[0]
     description_nodes = page.xpath('.//p[@class="descri"]/text()').extract()
     # 'description' is only set when the page has description text.
     if description_nodes:
         item['description'] = html_text_normalize(description_nodes)
     item['code'] = page.xpath('.//span[@class="numart"]/text()').extract()[0]
     raw_price = page.xpath(
         './/div[@class="descprod"]//div[@class="price-box"]//span[@itemprop="price"]/@content'
     ).extract()[0]
     item['price'] = price_normalize(raw_price)
     # Swatches whose class contains "disabledSwatch" are excluded.
     item['sizes'] = page.xpath(
         './/div[@class="swatchesContainer"]//li/div[not(contains(@class, "disabledSwatch"))]/text()'
     ).extract()
     item['image_urls'] = page.xpath(
         './/ul[@id="ul-moreviews"]//a[@class="cloud-zoom-gallery"]/@href'
     ).extract()
     yield item

Пример #11

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'batistella' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(1)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'batistella'
     item['breadcrumb'] = []
     item['title'] = page.xpath('.//h1[@itemprop="name"]/text()').extract()[0]
     # The description joins the intro paragraphs with the text of the
     # right-hand table.
     intro_text = page.xpath('.//div[@class="column push-1-16 col-10-12"]/p/text()').extract()
     table_text = page.xpath('.//table[@class="table-right"]//text()').extract()
     item['description'] = html_text_normalize(intro_text + table_text)
     item['code'] = page.xpath('.//span[@itemprop="sku"]/text()').extract()[0]
     raw_price = page.xpath('.//span[@itemprop="price"]/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     size_labels = page.xpath(
         './/select[@class="form-control attribute_select"]/option[@value!=0]/text()'
     ).extract()
     item['sizes'] = sizes_normalize(size_labels)
     item['image_urls'] = page.xpath('.//img[@data-src]/@data-src').extract()
     yield item

Пример #12

0

Показать файл

Файл: lucerna.py Проект: EitanRosenzvaig/crawler

 def parse_item(self, response):
     """Scrape one 'lucerna' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'lucerna'
     heading = page.xpath('.//h1[@itemprop="name"]/text()').extract()[0]
     item['title'] = heading
     # The first word of the title is used as the only breadcrumb entry.
     item['breadcrumb'] = [heading.split(' ', 1)[0]]
     description_nodes = page.xpath(
         './/div[@class="description user-content clear"]/p/text()'
     ).extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = None
     raw_price = page.xpath('.//span[@id="price_display"]/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     # Only the first five size options are kept.
     size_options = page.xpath(
         './/a[contains(@class,"insta-variations Talle") and span/@class="custom-variants"]/@data-option'
     ).extract()
     item['sizes'] = size_options[:5]
     item['other'] = None
     zoom_links = page.xpath(
         './/a[contains(@class,"cloud-zoom") and not(contains(@rel,"position"))]/@href'
     ).extract()
     # The first two characters of every link are dropped.
     item['image_urls'] = [link[2:] for link in zoom_links]
     yield item

Пример #13

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'sofimartire' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.get_with_short_wait(10, response.url)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'sofimartire'
     heading = page.xpath('.//div[@class="product-main-info"]//h1/text()').extract()[0]
     # The first word of the title is used as the only breadcrumb entry.
     item['breadcrumb'] = [heading.split(' ', 1)[0]]
     item['title'] = heading
     description_nodes = page.xpath('.//div[@id="collapseOne"]/div/text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = page.xpath('.//div[@class="sku"]/text()').extract()[0]
     raw_price = page.xpath(
         './/div[@class="product-main-info"]//span[@class="price"]/text()'
     ).extract()[0]
     item['price'] = price_normalize(raw_price)
     item['sizes'] = page.xpath(
         './/label[@class="amconf-color-container amconf-noimage-div"]/text()'
     ).extract()
     item['other'] = None
     item['image_urls'] = page.xpath(
         './/a[contains(@data-image,"product/cache")]/@data-zoom-image'
     ).extract()
     yield item

Пример #14

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'benditopie' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'benditopie'
     item['breadcrumb'] = []
     item['title'] = page.xpath('.//h1[@itemprop="name"]/text()').extract()[0]
     description_nodes = page.xpath(
         './/div[@itemprop="description"]//span/text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = ''
     raw_price = page.xpath('.//span[@itemprop="price"]/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     # Options whose text contains "gotado" are excluded by the XPath; the
     # size is the first two characters of each stripped label.
     option_labels = page.xpath(
         './/select[@id="ProductSelect-product-template"]/option[not(contains(text(),"gotado"))]/text()'
     ).extract()
     item['sizes'] = [text.strip()[:2] for text in option_labels]
     thumb_links = page.xpath(
         './/ul[@id="ProductThumbs-product-template"]/li/a/@href'
     ).extract()
     # The first two characters of every link are dropped.
     item['image_urls'] = [link[2:] for link in thumb_links]
     yield item

Пример #15

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'viamo' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'viamo'
     item['breadcrumb'] = page.xpath('.//ul[@class="breadcrumb"]/li//a/text()').extract()
     item['title'] = page.xpath('.//div[contains(@class,"prodname")]/text()').extract()[0]
     description_nodes = page.xpath(
         './/div[@class="productDescription"]/text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = None
     raw_price = page.xpath('.//strong[@class="skuBestPrice"]/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     # Labels whose class contains "unavailable" are excluded by the XPath.
     item['sizes'] = page.xpath(
         './/label[contains(@class,"dimension-Talle") and not(contains(@class,"unavailable"))]/text()'
     ).extract()
     item['other'] = None
     item['image_urls'] = page.xpath('.//a[contains(@title,"Zoom")]/@zoom').extract()
     yield item

Пример #16

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'ladystork' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'ladystork'
     heading = page.xpath('.//div[@class="p-title-group"]/h2/text()').extract()[0]
     item['title'] = html_text_normalize(heading)
     # The first two breadcrumb links are skipped.
     item['breadcrumb'] = page.xpath('.//ol/li/a/text()').extract()[2:]
     description_nodes = page.xpath(
         './/div[@id="ctl00_HTMLContent_pnlDesc"]/div/p/text()').extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = None
     raw_price = page.xpath('.//strong[@class="p-price"]/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     item['sizes'] = page.xpath(
         './/ul[@class="p-size-list"]/li[@class!="disabled"]/a/text()').extract()
     item['other'] = None
     item['image_urls'] = page.xpath(
         './/div[@class="slick p-thumbs-photo"]/div/img/@src').extract()
     yield item

Пример #17

0

Показать файл

 def parse_item(self, response):
     """Scrape one product page and yield it as an Item.

     The brand is read from the page itself. Nothing is yielded when
     ``self.links.find_one`` already returns a record for
     ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     brand_nodes = page.xpath(
         './/div[@class="prd-details"]//h2[@itemprop="brand"]/text()'
     ).extract()
     item['brand'] = brand_nodes[0]
     item['breadcrumb'] = []  # TODO
     title_nodes = page.xpath(
         './/div[@class="prd-details"]//h1[@class="prd-title"]/text()'
     ).extract()
     item['title'] = title_nodes[0]
     description_nodes = page.xpath(
         './/div[@id="productDetails"]//div[contains(@class,"prd-information")]/text()'
     ).extract()
     item['description'] = html_text_normalize(description_nodes)
     item['code'] = page.xpath('.//div[@id="detailSku"]/@data-sku').extract()[0]
     raw_price = page.xpath('.//span[@id="price_box"]/text()').extract()[0]
     item['price'] = price_normalize(raw_price)
     # The raw data-simple attribute values are handed to self.parseSize.
     size_payloads = page.xpath(
         './/div[@class="prd-details"]//ul[contains(@class,"shoe_size")]/li/@data-simple'
     ).extract()
     item['sizes'] = self.parseSize(size_payloads)
     item['image_urls'] = page.xpath(
         './/ul[@id="productMoreImagesList"]//li/@data-image-product'
     ).extract()
     yield item

Пример #18

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'justaosadia' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'justaosadia'
     item['breadcrumb'] = []
     item['title'] = page.xpath('.//span[@class="name"]/text()').extract()[0]
     # The description is the summary text followed by the detail list,
     # joined with a single space.
     summary = html_text_normalize(
         page.xpath('.//span[@class="desc"]//text()').extract())
     details = html_text_normalize(
         page.xpath('.//ul[@class="detail-list"]/li//text()').extract())
     item['description'] = summary + (' ' + details)
     item['code'] = page.xpath('.//span[@class="code"]/text()').extract()[0]
     raw_price = page.xpath(
         './/span[@class="price"]/span[@itemprop="price"]/@content'
     ).extract()[0]
     item['price'] = price_normalize(raw_price)
     item['sizes'] = page.xpath('.//ul[@class="sizes-list"]/li/@title').extract()
     item['image_urls'] = page.xpath('.//ul[@class="thumbs"]/li/a/@href').extract()
     yield item

Пример #19

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'honkytonk' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'honkytonk'
     item['breadcrumb'] = page.xpath('.//a[@class="breadcrumb-crumb"]/text()').extract()
     item['title'] = page.xpath(
         './/span[contains(@class,"product-name")]/text()').extract()[0]
     item['description'] = ''
     item['code'] = ''
     raw_price = page.xpath(
         './/span[@class="price product-price js-price-display"]/@content'
     ).extract()[0]
     item['price'] = price_normalize(raw_price)
     # Sizes come from the "talle" <select>; if it yields nothing, fall
     # back to the custom size links. Duplicates are collapsed via a set.
     size_labels = page.xpath(
         './/div[contains(./label/text(),"talle")]/select/option/text()'
     ).extract()
     if not size_labels:
         size_labels = page.xpath(
             './/a[contains(@class,"custom Size")]/span/@data-name'
         ).extract()
     item['sizes'] = list(set(size_labels))
     # Every image link has its first two characters dropped. The zoom
     # link is used when the thumbnail scroller is empty.
     scroller_links = page.xpath(
         './/div[@class="jTscroller scroller-thumbs"]/a/@href').extract()
     image_urls = [link[2:] for link in scroller_links]
     if not image_urls:
         zoom_links = page.xpath('.//a[@id="zoom"]/@href').extract()
         image_urls = [link[2:] for link in zoom_links]
     item['image_urls'] = image_urls
     yield item

Пример #20

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'xl' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed. The
     price is not read from the page: it is taken from
     ``response.meta['price']``.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     time.sleep(2)
     source = self.browser.page_source
     sel = Selector(text=source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'xl'
     item['breadcrumb'] = sel.xpath(
         './/li[@class="last" and @typeof="v:Breadcrumb"]/a/text()'
     ).extract()
     item['title'] = sel.xpath(
         './/div[contains(@class, "fn productName")]/text()').extract()[0]
     raw_description = sel.xpath(
         './/div[contains(@class, "productDescription")]/text()'
     ).extract()
     # Split the 'Código:' lines out of the description. A new list is
     # built instead of deleting from the list being iterated, which made
     # the loop skip the element following every removed line.
     marker = 'Código:'
     code = ''
     description = []
     for line in raw_description:
         marker_at = line.find(marker)
         if marker_at == -1:
             description.append(line)
         else:
             # Skip the marker plus the single character after it.
             code = line[marker_at + len(marker) + 1:]
     item['description'] = html_text_normalize(description)
     item['code'] = code
     item['price'] = price_normalize(response.meta['price'])
     item['sizes'] = sel.xpath(
         './/div[@class="talles isTalle"]/span[@class="stock"]/text()'
     ).extract()
     img_urls = sel.xpath('.//div[@class="thumbs"]/img/@src').extract()
     # Thumbnail paths are prefixed with the shop root.
     item['image_urls'] = [
         'https://www.xlshop.com.ar/' + url for url in img_urls
     ]
     yield item

Пример #21

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'sarkany' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return
     print("------------- New Item ----------------")
     self.browser.get(response.url)
     page = Selector(text=self.browser.page_source)
     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'sarkany'
     # The category lives in an inline script, in a string like
     #   vtxctx = {skus:"23",.....,categoryName:"Fiesta",....}
     # Jump 14 characters past the start of 'categoryName' (the key plus
     # the following ':"') and read up to the next double quote.
     script_text = page.xpath(
         '//script[contains(.,"categoryName")]/text()').extract()[0]
     value_start = script_text.find('categoryName') + 14
     tail = script_text[value_start:]
     category = tail[:tail.find('"')]
     item['breadcrumb'] = [category]
     item['title'] = page.xpath(
         './/div[contains(@class,"prodname")]/text()').extract()[0]
     first_description = page.xpath(
         './/div[@class="productDescription"]/text()').extract()[0]
     item['description'] = html_text_normalize(first_description)
     item['code'] = None
     # A page without a best-price element gets a price of 0.
     best_prices = page.xpath('.//strong[@class="skuBestPrice"]/text()').extract()
     item['price'] = price_normalize(best_prices[0]) if best_prices else 0
     item['sizes'] = page.xpath(
         './/label[contains(@class,"Talle") and not(contains(@class,"unavailable"))]/text()'
     ).extract()
     item['other'] = None
     item['image_urls'] = page.xpath('.//a[@id="botaoZoom"]/@rel').extract()
     yield item

Пример #22

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'ancayco' product page and yield it as an Item.

     Nothing is yielded when ``self.links.find_one`` already returns a
     record for ``response.url``; only a marker line is printed.

     Sizes are found by clicking every size/colour combination in the
     live browser and keeping the sizes for which the buy button's
     extracted value does not contain 'display: none;'.
     """
     if self.links.find_one({"_id": response.url}) is None:
         print("------------- New Item ----------------")
         self.browser.get(response.url)
         time.sleep(2)
         source = self.browser.page_source
         sel = Selector(text=source)
         item = Item()
         item['created_at'] = datetime.now()
         item['url'] = response.url
         item['brand'] = 'ancayco'
         item['breadcrumb'] = []
         item['title'] = sel.xpath('.//div[@class="name uppercase bold"]/text()').extract()[0]
         # Text fragments mentioning 'PRODUCTO' or 'MERCADO' are blanked out
         # (replaced by '') rather than removed from the list.
         description = [(text if ('PRODUCTO' not in text and 'MERCADO' not in text) else '') for text in sel.xpath('.//div[@class="lfill top-1"]//text()').extract()]
         description = html_text_normalize(description)
         item['description'] = description
         # Strip the 'Código ' label from the code text.
         item['code'] = sel.xpath('.//div[@class="lfill"]/text()').extract()[0].replace('Código ','')
         price = sel.xpath('.//span[@class="_totalContainer left-1"]//text()').extract()
         # A page without a price element gets a price of 0.
         if len(price) > 0:
             price = price[0]
             item['price'] = price_normalize(price)
         else:
             item['price'] = 0
         sizes = []
         # Try every size x colour combination in the live browser.
         # self.size_div_path, self.color_div_path and self.buy_button_path
         # are defined outside this method.
         for size_div in self.browser.find_elements_by_xpath(self.size_div_path):
             for color_div in self.browser.find_elements_by_xpath(self.color_div_path):
                 self.click_element(size_div)
                 self.click_element(color_div)
                 time.sleep(1)
                 actual_size = size_div.text
                 # Re-parse the page after the clicks. This rebinds `sel`, so
                 # the image XPath below runs against the last refreshed page.
                 source = self.browser.page_source
                 sel = Selector(text=source)
                 buy_button_style = sel.xpath(self.buy_button_path).extract()[0]
                 # The size is kept when the buy button is not hidden.
                 if not 'display: none;' in buy_button_style:
                     sizes.append(actual_size)
         # A size reachable through several colours is recorded once.
         item['sizes'] = list(set(sizes))
         item['image_urls'] = sel.xpath('.//div[@class="thumbnail"]/a/@href').extract()
         yield item
     else:
         print("-------------- OLD -------------")

Пример #23

0

Показать файл

Файл: 47street.py Проект: EitanRosenzvaig/crawler

 def parse_item(self, response):
     """Scrape one '47street' product page and yield it as an ``Item``.

     URLs already present in ``self.links`` are only logged as old and
     produce no item.  Otherwise the page is opened through ``self.browser``
     and all fields are read from the rendered page source.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return

     print("------------- New Item ----------------")
     self.browser.get(response.url)
     # Fixed pause before reading the rendered page source.
     time.sleep(2)
     page = Selector(text=self.browser.page_source)

     def extract_all(xpath):
         # Every match of *xpath* on the parsed page, as a list of strings.
         return page.xpath(xpath).extract()

     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = '47street'
     item['breadcrumb'] = []
     item['title'] = extract_all('.//span[@class="base"]/text()')[0]
     item['description'] = html_text_normalize(
         extract_all('.//div[@itemprop="description"]/text()'))
     sku_text = extract_all('.//div[@itemprop="sku"]/text()')[0]
     item['code'] = sku_text.replace('SKU# ', '')
     price_text = extract_all('.//span[@class="price"]/text()')[0]
     item['price'] = price_normalize(price_text)
     item['sizes'] = extract_all('.//div[@class="swatch-option text"]/text()')
     item['image_urls'] = extract_all('.//div[@class="imagen"]/img/@src')
     yield item

Пример #24

0

Показать файл

Файл: sibylvane.py Проект: EitanRosenzvaig/crawler

 def parse_item(self, response):
     """Scrape one 'sibylvane' product page and yield it as an ``Item``.

     URLs already present in ``self.links`` are only logged as old and
     produce no item.  Otherwise the page is opened through ``self.browser``
     and all fields are read from the rendered page source.
     """
     known = self.links.find_one({"_id": response.url})
     if known is not None:
         print("-------------- OLD -------------")
         return

     print("------------- New Item ----------------")
     self.browser.get(response.url)
     # Fixed pause before reading the rendered page source.
     time.sleep(2)
     sel = Selector(text=self.browser.page_source)

     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'sibylvane'
     item['breadcrumb'] = []

     title_texts = sel.xpath('.//h3[@class="light-blue"]/text()').extract()
     item['title'] = html_text_normalize(title_texts)

     description_texts = sel.xpath('.//div[@id="ctl00_HTMLContent_pnlDesc"]/p/text()').extract()
     item['description'] = html_text_normalize(description_texts)

     # Strip the "ref:" label and any newlines from the reference text.
     ref_text = sel.xpath('.//div[@class="ref"]/text()').extract()[0]
     item['code'] = ref_text.replace('ref:', '').replace('\n','')

     # The price is taken from the SECOND text node of the price panel.
     price_nodes = sel.xpath('.//div[@id="ctl00_HTMLContent_pnlPrice"]/text()').extract()
     item['price'] = price_normalize(price_nodes[1])

     # Skip the placeholder option (value -1) and disabled options.
     item['sizes'] = sel.xpath('.//select[@id="ddlSizesPicker"]/option[not(@value="-1") and not(@disabled)]/text()').extract()
     item['image_urls'] = sel.xpath('.//img[@id="imgProductGallery"]/@data-zoom-image').extract()
     yield item

Пример #25

0

Показать файл

 def parse_item(self, response):
     """Scrape one product page and yield it as an ``Item``.

     Unlike sibling spiders that first look the URL up in ``self.links``,
     this method always scrapes.  The brand is read from the page and
     slugified instead of being hard-coded.
     """
     brand_xpath = './/h6[@class="fb-product-cta__brand fb-stylised-caps"]/text()'
     breadcrumb_xpath = './/b[@class="fb-masthead__breadcrumb__links"]//span[@itemprop="title"]/text()'
     title_xpath = './/h1[@class="fb-product-cta__title"]/text()'
     spec_xpath = './/table[@class="fb-product-information__specification__table"]//tr[contains(@class,"row-data")]//text()'
     code_xpath = './/p[@class="fb-product-sets__product-code"]/text()'
     price_xpath = './/p[@class="fb-price" and contains(text(), "Contado")]/text()'
     sizes_xpath = './/select[@class="fb-inline-dropdown__native-dropdown fsrVisible"]/option[@value!=""]/@value'
     images_xpath = './/span[@class="fb-pp-gallery-list__link js-pp-zoom-link" and not(span/i[@class="icon-productGalleryMore"])]/@data-image-zoom'

     print("------------- New Item ----------------")
     self.browser.get(response.url)
     # Fixed pause before reading the rendered page source.
     time.sleep(1)
     sel = Selector(text=self.browser.page_source)

     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url

     brand_name = sel.xpath(brand_xpath).extract()[0]
     item['brand'] = slugify(brand_name)
     item['breadcrumb'] = html_text_normalize(sel.xpath(breadcrumb_xpath).extract())
     item['title'] = sel.xpath(title_xpath).extract()[0]
     item['description'] = html_text_normalize(sel.xpath(spec_xpath).extract())

     # Drop the leading label so only the code itself remains.
     code_text = sel.xpath(code_xpath).extract()[0]
     item['code'] = code_text.replace('Código del producto:', '')

     # Only the price paragraph containing "Contado" is used.
     price_text = sel.xpath(price_xpath).extract()[0]
     item['price'] = price_normalize(price_text.replace('Contado', ''))

     item['sizes'] = sizes_normalize(sel.xpath(sizes_xpath).extract())

     # Drop the first two characters of every zoom URL (presumably a
     # protocol-relative "//" prefix -- confirm against live pages).
     zoom_urls = sel.xpath(images_xpath).extract()
     item['image_urls'] = [zoom_url[2:] for zoom_url in zoom_urls]
     yield item

Пример #26

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'natacha' product page and yield it as an ``Item``.

     URLs already present in ``self.links`` are only logged as old and
     produce no item.  Otherwise the page is opened through ``self.browser``
     and all fields are read from the rendered page source.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return

     print("------------- New Item ----------------")
     self.browser.get(response.url)
     # Fixed pause before reading the rendered page source.
     time.sleep(2)
     sel = Selector(text=self.browser.page_source)

     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'natacha'

     # Keep only the middle breadcrumb links (first and last are dropped);
     # trails with two or fewer links yield an empty breadcrumb.
     crumbs = sel.xpath('.//nav/ol/li/a/text()').extract()
     item['breadcrumb'] = crumbs[1:-1] if len(crumbs) > 2 else []

     heading = sel.xpath('.//h1[@class="title border"]/text()').extract()[0]
     item['title'] = heading.replace('Natacha Zapato Mujer', '')

     description = html_text_normalize(
         sel.xpath('.//article[@id="tabDescription"]//text()').extract())
     # Cut the description where the generic boilerplate text begins.
     boilerplate_start = ' Productos confeccionados'
     if boilerplate_start in description:
         cut_at = description.index(boilerplate_start)
         description = description[:cut_at]
     item['description'] = description

     item['code'] = ''
     price_text = sel.xpath('.//span[@class="ch-price price"]/text()').extract()[0]
     item['price'] = price_normalize(price_text)

     size_labels = sel.xpath('.//div[@id="my-variation-1-container"]//menu[contains(@class,"ch-select-content")]/li/span[not(text()="Talle")]/text()').extract()
     if not size_labels:
         # No size menu entries: fall back to the first span whose text
         # contains "Talle".
         lone_size = sel.xpath('.//span[contains(text(),"Talle")]/text()').extract()[0]
         size_labels = [lone_size.replace('Talle: ','')]
     # Out-of-stock sizes are listed with a suffix, e.g.
     # [ "37 - Sin Stock", "39 - Sin Stock", "40 - Sin Stock", "41 - Sin Stock" ]
     item['sizes'] = [label for label in size_labels if "Sin Stock" not in label]

     carousel_urls = sel.xpath('.//ul[@class="ch-carousel-list"]/li/img/@src').extract()
     if len(carousel_urls) > 1:
         # Eliminate size table image (the last carousel entry).
         carousel_urls = carousel_urls[:-1]
     item['image_urls'] = carousel_urls
     yield item

Пример #27

0

Показать файл

Файл: martinasaban.py Проект: EitanRosenzvaig/crawler

 def parse_item(self, response):
     """Scrape one 'martinasaban' product page and yield it as an ``Item``.

     URLs already present in ``self.links`` are only logged as old and
     produce no item.  Otherwise the page is opened through ``self.browser``
     and the static fields are read from the rendered page source.

     Size availability is probed by clicking every size option and checking
     whether the add-to-cart button text contains 'Agregar'.  The page is
     re-parsed after each click so the check reflects the size that was just
     selected.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return

     print("------------- New Item ----------------")
     self.browser.get(response.url)
     # Fixed pause before reading the rendered page source.
     time.sleep(2)
     sel = Selector(text=self.browser.page_source)

     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'martinasaban'
     item['breadcrumb'] = []
     item['title'] = sel.xpath(
         './/h1[@class="name ng-binding"]/text()').extract()[0]
     item['description'] = html_text_normalize(
         sel.xpath('.//div[@class="tab-content"]//text()').extract())
     item['code'] = ''
     item['price'] = price_normalize(
         sel.xpath('.//p[@class="price ng-binding"]/text()').extract()[0])

     sizes = []
     for size_span in self.browser.find_elements_by_xpath(
             './/tag-option/a/span'):
         size_span.click()
         time.sleep(0.5)
         actual_size = size_span.text
         # Re-parse the live page: the snapshot in ``sel`` was taken before
         # any click, so reading the button from it would give the same
         # answer for every size.
         clicked_sel = Selector(text=self.browser.page_source)
         button_texts = clicked_sel.xpath(
             './/button[@id="addItemMyCart"]//span/text()').extract()
         # A missing button is treated as "not purchasable" instead of
         # aborting the whole item with an IndexError.
         if button_texts and 'Agregar' in button_texts[0]:
             sizes.append(actual_size)
     item['sizes'] = sizes

     # Image URLs come from the initial snapshot; the 'Square' variant in
     # each URL is swapped for 'Original'.
     item['image_urls'] = [
         url.replace('Square', 'Original')
         for url in sel.xpath(
             './/li[@class="carousel-item ng-scope"]/img/@src').extract()
     ]
     yield item

Пример #28

0

Показать файл

Файл: grimoldi.py Проект: EitanRosenzvaig/crawler

 def parse_item(self, response):
     """Scrape one 'grimoldi' product page and yield it as an ``Item``.

     URLs already present in ``self.links`` are only logged as old and
     produce no item.  Otherwise the page is opened through ``self.browser``
     and all fields are read from the rendered page source.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return

     print("------------- New Item ----------------")
     self.browser.get(response.url)
     # Fixed (long) pause before reading the rendered page source.
     time.sleep(10)
     sel = Selector(text=self.browser.page_source)

     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'grimoldi'

     # Get first word of title i.e Abotinadas Berry -> "Abotinadas".
     page_title = sel.xpath('.//title/text()').extract()[0]
     item['breadcrumb'] = page_title.split(' ', 1)[0]

     item['title'] = sel.xpath('.//span[@id="Nombre"]/text()').extract()[0]

     full_description = html_text_normalize(
         sel.xpath('.//div[@class="description"]/p/text()').extract())
     # Remove generic text: keep only what precedes the first '. Compr'.
     item['description'] = full_description.split('. Compr', 1)[0]

     item['code'] = None
     price_text = sel.xpath(
         './/label[@id="PrecioSeleccionado"]/text()').extract()[0]
     item['price'] = price_normalize(price_text)
     item['sizes'] = sel.xpath(
         './/select[@id="IdMedidaSeleccionada"]/option/text()').extract()
     item['other'] = None

     # Drop the first two characters of every image URL (presumably a
     # protocol-relative "//" prefix -- confirm against live pages).
     raw_urls = sel.xpath(
         './/div[@class="productImages"]//li/img/@data-image-url').extract()
     item['image_urls'] = [raw_url[2:] for raw_url in raw_urls]
     yield item

Пример #29

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'heyas' product page and yield it as an ``Item``.

     URLs already present in ``self.links`` are only logged as old and
     produce no item.  Otherwise the page is opened through ``self.browser``
     and all fields are read from the rendered page source.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return

     print("------------- New Item ----------------")
     self.browser.get(response.url)
     # Fixed pause before reading the rendered page source.
     time.sleep(2)
     sel = Selector(text=self.browser.page_source)
     main_info = './/div[@class="product-main-info text-center"]'

     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'heyas'
     item['breadcrumb'] = []
     item['title'] = sel.xpath(main_info + '//h2/text()').extract()[0]
     item['description'] = html_text_normalize(
         sel.xpath('.//div[@class="description"]//text()').extract())

     sku_text = sel.xpath('.//div[@class="sku"]/text()').extract()[0]
     item['code'] = sku_text.replace('SKU# ', '')

     # When several price spans are present the LAST one is used; with a
     # single span that is also the first one.
     price_texts = sel.xpath(
         main_info + '//span[@class="price"]/text()').extract()
     item['price'] = price_normalize(price_texts[-1])

     # Labels carrying a "no-stock" class are excluded from the sizes.
     item['sizes'] = sel.xpath(
         './/div[@class="input-box"]//label[not(contains(@class,"no-stock"))]/text()'
     ).extract()
     item['image_urls'] = sel.xpath(
         './/div[@id="gallery_01"]//a/@data-zoom-image').extract()
     yield item

Пример #30

0

Показать файл

 def parse_item(self, response):
     """Scrape one 'brunomanetti' product page and yield it as an ``Item``.

     URLs already present in ``self.links`` are only logged as old and
     produce no item.  Otherwise the page is opened through ``self.browser``
     and the static fields are read from the rendered page source.

     Size availability is probed by clicking every size link and checking
     whether the buy button is enabled afterwards.

     ``item['description']`` is always the result of
     ``html_text_normalize`` and ``item['image_urls']`` is always a list,
     matching the other spiders.
     """
     if self.links.find_one({"_id": response.url}) is not None:
         print("-------------- OLD -------------")
         return

     print("------------- New Item ----------------")
     self.browser.get(response.url)
     # Fixed pause before reading the rendered page source.
     time.sleep(2)
     sel = Selector(text=self.browser.page_source)

     item = Item()
     item['created_at'] = datetime.now()
     item['url'] = response.url
     item['brand'] = 'brunomanetti'
     item['breadcrumb'] = []
     item['title'] = sel.xpath('.//span[@itemprop="name"]/text()').extract()[0]

     description_parts = [sel.xpath('.//h3/strong/font/i/text()').extract()[0]]
     description_parts += sel.xpath('.//p[contains(@style,"color: rgb(51, 51, 51); font-family: sans-serif, Arial, Verdana, ")]//text()').extract()
     if len(description_parts) > 3:
         # Longer descriptions end with two trailing fragments that are
         # dropped before normalizing.
         description_parts = description_parts[:-2]
     # Normalize in every case so the field never holds a raw list.
     item['description'] = html_text_normalize(description_parts)

     item['code'] = ''
     price = sel.xpath('.//span[@id="price_display"]/text()').extract()[0]
     item['price'] = price_normalize(price)

     sizes = []
     for size_a in self.browser.find_elements_by_xpath(self.size_a_path):
         try:
             size_a.click()
         except Exception:
             # Best effort: a size link that cannot be clicked is still
             # checked against the current button state below.  Catching
             # Exception (not a bare except) lets KeyboardInterrupt and
             # SystemExit propagate.
             pass
         time.sleep(0.2)
         actual_size = size_a.text
         buy_button = self.browser.find_elements_by_xpath(self.buy_button_path)[0]
         if buy_button.is_enabled():
             sizes.append(actual_size)
     item['sizes'] = sizes

     # Store a LIST of URLs (a bare string would be treated as a sequence
     # of characters by list consumers).  The first two characters of each
     # href are dropped (presumably a protocol-relative "//" prefix --
     # confirm against live pages).
     zoom_hrefs = sel.xpath('.//a[@class="cloud-zoom"]/@href').extract()
     item['image_urls'] = [href[2:] for href in zoom_hrefs]
     yield item

Python price_normalize примеры использования