def parse_products(self, response):
    """Yield one product per entry of a listing page.

    Two layouts are handled: ``div`` cells with ``class="grid-25"`` and,
    when the page has none of those, table rows with ``class="prd first"``.
    Entries for which no price text can be found are skipped.
    """
    hxs = HtmlXPathSelector(response)

    grid_cells = hxs.select('//*[@id="area-2"]//div[@class="grid-25"]')
    if grid_cells:
        for cell in grid_cells:
            price = cell.select(
                'div/div/p[@class="prd-amount"]/strong/text()').extract()
            if not price:
                # Indexing an empty result would raise and abort the page.
                continue
            loader = ProductLoader(item=Product(), selector=cell)
            loader.add_xpath('url', 'div/h3/a/@href')
            # Use the <abbr title="..."> value as the name when there is one.
            if cell.select('div/h3/a/abbr/@title'):
                loader.add_xpath('name', 'div/h3/a/abbr/@title')
            else:
                loader.add_xpath('name', 'div/h3/a/text()')
            loader.add_value('price', self._encode_price(price[0]))
            yield loader.load_item()
        return

    for row in hxs.select('//*[@id="area-2"]//tr[@class="prd first"]'):
        # The price is either directly in a cell paragraph or inside a nested
        # <div>. It is resolved afresh for every row so that a row without a
        # price can never reuse the previous row's value.
        price = (row.select('td/p/strong/text()').extract()
                 or row.select('td/div/p/strong/text()').extract())
        if not price:
            continue
        loader = ProductLoader(item=Product(), selector=row)
        loader.add_xpath('url', 'td/h3/a/@href')
        loader.add_xpath('name', 'td/h3/a/text()')
        loader.add_value('price', self._encode_price(price[0]))
        yield loader.load_item()
def parse_page(self, response):
    """Follow navigation and pagination links and yield the listed products.

    Yields a request for every link of the ``ul#nav`` menu and for the
    pager's "next" link (all handled by this same callback), followed by one
    product per ``li`` found under ``div.products``.
    """
    base_url = get_base_url(response)
    base_url_func = functools.partial(urljoin_rfc, base_url)
    hxs = HtmlXPathSelector(response)

    for href in hxs.select("//ul[@id='nav']//a/@href").extract():
        yield Request(base_url_func(href), callback=self.parse_page)

    # next page
    next_query = hxs.select(
        "//div[@class='pagerLine']//a[@class='next']/@data-query").extract()
    if next_query:
        yield Request(base_url_func(next_query[0]), callback=self.parse_page)

    # products
    for node in hxs.select("//div[@class='products']//li"):
        loader = ProductLoader(selector=node, item=Product())
        # identifier and sku are both the article number embedded in the
        # product URL attribute.
        loader.add_xpath('identifier', "@data-product-url", first,
                         re=r"articleNumber=(\d+)")
        loader.add_xpath('sku', "@data-product-url", first,
                         re=r"articleNumber=(\d+)")
        loader.add_xpath('url', "@data-product-url", first, base_url_func)
        loader.add_xpath(
            'name',
            ".//div[@class='detailsInnerWrap']/span[@class='brand']/text()")
        loader.add_xpath(
            'name',
            ".//div[@class='detailsInnerWrap']"
            "/a[starts-with(@class, 'name')]/text()")

        # Prefer the <ins> text, then any text inside the price paragraph.
        price = (node.select(".//p[@class='price']/ins//text()")
                 or node.select(".//p[@class='price']//text()")
                 or node.select(".//p[@class='price']/del//text()"))
        # Decimal comma -> dot, and drop non-breaking spaces.
        price = ''.join(price.extract()).replace(',', '.').replace(u'\xa0', '')
        loader.add_value('price', price)
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page when its code matches the searched SKU.

    The name is taken from the ``productname`` font element; when that is
    missing it is derived from the last path segment of the URL plus the
    searched SKU. Any "PURCHASE OPTIONS" text is appended to the name.
    The item is only yielded when the page's product code, with dashes
    removed, starts with ``response.meta['sku']``.
    """
    search_sku = response.meta['sku']
    hxs = HtmlXPathSelector(response)

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)

    main_name = None
    name_xpaths = (u'//font[contains(@class,"productname")]/big/text()',
                   u'//font[contains(@class,"productname")]/text()')
    for name_xpath in name_xpaths:
        found = hxs.select(name_xpath).extract()
        if found:
            main_name = found[0].strip()
            break

    if not main_name:
        # Fall back to "<file name> (<sku>)" built from the page URL.
        match = re.search(u'.*/(.*)\.htm', response.url)
        if match:
            main_name = match.groups()[0] + u' (%s)' % search_sku
        else:
            main_name = None

    options = hxs.select(u'//td//text()').re(u'PURCHASE OPTIONS: (.*)')
    if options and main_name:
        # Only extend an existing name; with no name there is nothing to
        # append to (None += text would raise TypeError).
        main_name += u' %s' % options[0].strip()

    loader.add_value('name', main_name)
    loader.add_xpath(
        'price',
        u'//td//font[contains(@class,"pricecolor") and '
        u'not(ancestor::table[contains(@id,"related")])]/text()')
    loader.add_value('sku', search_sku)

    code = hxs.select(u'//span[@class="product_code"]/text()').extract()
    if code and code[0].replace('-', '').startswith(search_sku):
        yield loader.load_item()
def parse_products(self, hxs, response):
    """Yield products from a table of ``orderinfo`` cells.

    The column positions of the "Model", "Description" and "Price" header
    cells are located first; every other row is then read using those
    positions. Rows lacking a price or a non-blank name are skipped.

    :param hxs: selector over the page being parsed.
    :param response: the response the selector was built from; used for the
        fallback URL and for resolving relative model links.
    """
    def column_position(header):
        found = hxs.select(
            'count(//td[starts-with(@class, "orderinfo")'
            ' and text()="%s"]/preceding-sibling::*) + 1' % header).extract()
        # Keep only the part before any decimal point of the XPath number.
        return found[0].split('.')[0] if found else None

    model_pos = column_position('Model')
    description_pos = column_position('Description')
    price_pos = column_position('Price')
    if None in (model_pos, description_pos, price_pos):
        return

    # Parent rows of every model-column cell except the header cell itself.
    rows = hxs.select(
        '//td[starts-with(@class, "orderinfo") and position()=%s '
        'and not(text()="Model")]/..' % model_pos)
    for row in rows:
        loader = ProductLoader(selector=row, item=Product())

        model_url = row.select(
            './/td[starts-with(@class, "orderinfo") '
            'and position()=%s]//a/@href' % model_pos).extract()
        if model_url:
            url = urljoin_rfc(get_base_url(response), model_url[0])
        else:
            # No per-model link: fall back to the listing page itself.
            url = response.url
        loader.add_value('url', url)

        loader.add_xpath(
            'name',
            './/td[starts-with(@class, "orderinfo") and position()=%s]/text()'
            % description_pos)
        loader.add_xpath(
            'price',
            './/td[starts-with(@class, "orderinfo") and position()=%s]//text()'
            % price_pos)

        name = loader.get_output_value('name')
        # The name may be missing altogether, so test it before stripping.
        if (not loader.get_output_value('price')
                or not name or not name.strip()):
            continue
        yield loader.load_item()
def parse(self, response):
    """Yield the products of a listing page, then requests for submenu links.

    ``response.meta['level']`` (default 1) gives how many ``ul/li`` levels of
    the ``pMenul0`` menu to descend when collecting subcategory links; each
    follow-up request carries ``level + 1``.
    """
    hxs = HtmlXPathSelector(response)

    for node in hxs.select(u'//div[@class="prelement"]'):
        loader = ProductLoader(item=Product(), selector=node)
        loader.add_xpath('name', u'.//a/text()')

        raw_price = node.select(u'.//p[@class="prpri"]/text()').extract()[0]
        # NOTE(review): lstrip() removes any leading run of the characters in
        # its argument, not the literal prefix.
        cleaned = raw_price.strip().lstrip('Pris: DKK ')
        # Drop thousands dots, then turn the decimal comma into a dot.
        cleaned = cleaned.replace('.', '').replace(',', '.')
        loader.add_value('price', cleaned)

        href = node.select(u'.//a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(get_base_url(response), href))
        yield loader.load_item()

    depth = response.meta.get('level', 1)
    menu_xpath = (u'//ul[@id="pMenul0"]/../'
                  + u'/'.join([u'ul/li'] * depth)
                  + '/a/@href')
    for href in hxs.select(menu_xpath).extract():
        target = urljoin_rfc(get_base_url(response), href)
        yield Request(target, meta={'level': depth + 1})
def parse(self, response):
    """Yield the products of a listing page, then requests for submenu links.

    ``response.meta['level']`` (default 1) gives how many ``table/tr/td``
    levels of the ``ProductMenu_Table`` menu to descend when collecting
    subcategory links; each follow-up request carries ``level + 1``.
    """
    hxs = HtmlXPathSelector(response)

    for cell in hxs.select(u'//td[@class="Description_ProductList"]'):
        loader = ProductLoader(item=Product(), selector=cell)
        loader.add_xpath('name', u'.//a/@title')

        raw_price = cell.select(
            u'../..//span[@class="Price_Productlist"]/text()').extract()[0]
        # NOTE(review): rstrip() removes any trailing run of the characters
        # ' ', 'D' and 'K', not the literal suffix.
        cleaned = raw_price.strip().rstrip(' DKK')
        cleaned = cleaned.replace('.', '').replace(',', '.')
        # The literal text 'Ring for pris!' is mapped to a price of 0.
        if cleaned == u'Ring for pris!':
            cleaned = 0
        loader.add_value('price', cleaned)

        href = cell.select(u'.//a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(get_base_url(response), href))
        yield loader.load_item()

    depth = response.meta.get('level', 1)
    menu_xpath = (u'//table[@id="ProductMenu_Table"]/../'
                  + u'/'.join([u'table/tr/td'] * depth)
                  + '/a/@href')
    for href in hxs.select(menu_xpath).extract():
        target = urljoin_rfc(get_base_url(response), href)
        yield Request(target, meta={'level': depth + 1})
def parse_page(self, response):
    """Yield the products of a category page and the requests that follow it.

    When the page has no pagination links, every "refine category" sidebar
    link is requested. Otherwise the last pagination link is requested,
    provided ``self._is_next`` accepts it.
    """
    site_root = 'http://www.dolphinmusic.co.uk/'
    hxs = HtmlXPathSelector(response)

    for node in hxs.select('//div[@class="item"]'):
        loader = ProductLoader(item=Product(), selector=node)
        loader.add_xpath('name', 'h2/a/text()')
        href = node.select('h2/a/@href').extract()[0]
        loader.add_value('url',
                         urljoin_rfc(site_root, href, response.encoding))
        loader.add_xpath('price',
                         'div[@class="pricing"]/p[@class="price"]/text()')
        yield loader.load_item()

    pager_links = hxs.select(
        '//*[@id="categoryMain"]/div[@class="pagination"]/ul/li/a/@href'
    ).extract()

    if pager_links:
        # Only the last pagination link is a candidate for "next".
        candidate = pager_links[-1]
        if self._is_next(candidate):
            yield Request(
                urljoin_rfc(site_root, candidate, response.encoding),
                callback=self.parse_page)
        return

    refine_links = hxs.select(
        '//*[@id="sidebar"]/ul[@id="refineCat"]/li/a/@href').extract()
    for href in refine_links:
        yield Request(urljoin_rfc(site_root, href, response.encoding),
                      callback=self.parse_page)
def parse_product(self, response):
    """Yield the product of a detail page, expanding its option drop-down.

    On the first visit (no ``requested`` key in ``response.meta``) one form
    post per ``<option>`` is issued back to this callback. In every case the
    currently selected option text is appended to the name, and the item is
    yielded only when a price was found.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    name = hxs.select(
        u'//div[@class="datac2"]//h1[@class="mpv_desc"]/text()'
    ).extract()[0].strip()

    option_nodes = hxs.select(u'//select[@class="mpv_itemalst"]//option')
    if option_nodes:
        if u'requested' not in response.meta:
            for option_node in option_nodes:
                post_fields = {
                    u'ctl00$MainContent$ItemAList':
                        option_node.select(u'./@value').extract()[0],
                    u'__EVENTTARGET': u'ctl00$MainContent$ItemAList',
                    u'__EVENTARGUMENT': u'',
                }
                yield FormRequest.from_response(
                    response,
                    formname=u'aspNetForm',
                    formdata=post_fields,
                    meta={u'requested': True},
                    dont_click=True,
                    callback=self.parse_product)

        selected_text = option_nodes.select(
            u'../option[@selected]/text()').extract()[0]
        name += u' %s' % selected_text.strip()

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_xpath(
        'price', u'//div[@class="datac2"]//span[@class="offerprc"]/text()')
    # Fall back to the other price element when the first one gave no price.
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="mpv_prc"]/text()')

    if loader.get_output_value('price'):
        yield loader.load_item()
def parse_pagination(self, response):
    """Yield the products of a listing page and requests for its other pages.

    Every pager link that is neither the selected page nor a
    "Next 10"/"Previous 10" link is requested, followed by the "Next 10"
    link when there is one.
    """
    URL_BASE = 'http://www.dv247.com/'
    hxs = HtmlXPathSelector(response)

    for node in hxs.select('//div[@class="listItem clearfix"]'):
        loader = ProductLoader(item=Product(), selector=node)
        # The name is all the text found inside the item's links.
        loader.add_value('name', ''.join(node.select('.//a//text()').extract()))
        href = node.select('.//a/@href')[0].extract()
        loader.add_value('url', urljoin_rfc(URL_BASE, href))
        loader.add_xpath('price', './/li[@class="price"]/text()')
        yield loader.load_item()

    # next page
    paging_blocks = hxs.select('//div[@class="listPaging"]')
    if not paging_blocks:
        return

    pager = paging_blocks[0]
    next_ten = pager.select('.//a[text()="Next 10"]/@href').extract()
    page_links = pager.select(
        './/a[not(@class="selectedpage") and not(text()="Next 10") '
        'and not(text()="Previous 10")]/@href').extract()

    for href in page_links:
        yield Request(urljoin_rfc(URL_BASE, href),
                      callback=self.parse_pagination)
    if next_ten:
        yield Request(urljoin_rfc(URL_BASE, next_ten[0]),
                      callback=self.parse_pagination)
def parse_products(self, response):
    """Request the next results page, then yield the products of this one.

    Products without a price element are emitted with the price "0.0".
    """
    hxs = HtmlXPathSelector(response)

    next_link = hxs.select(
        '//div[@id="center-main"]//a[@class="right-arrow"]/@href')
    if next_link:
        next_url = self._get_products_url(response, next_link[0].extract())
        yield Request(next_url, callback=self.parse_products)

    for node in hxs.select('//div[@id="center-main"]//div[@class="details"]'):
        loader = ProductLoader(item=Product(), selector=node)
        loader.add_xpath("name", "a/text()")
        loader.add_xpath("sku", 'div[@class="sku"]/span/text()')

        # few prices were under div class desc
        price_nodes = node.select(
            './/div[@class="price-row"]/span[@class="price-value"]/span/text()')
        price = price_nodes[0].extract() if price_nodes else "0.0"
        loader.add_value("price", price)

        href = node.select("a/@href")[0].extract()
        loader.add_value("url", urljoin_rfc(get_base_url(response), href))
        yield loader.load_item()
def parse(self, response):
    """Yield the products of a listing page, then requests for submenu links.

    Responses whose URL is in ``self.junk_urls`` are ignored.
    ``response.meta['level']`` (default 1) gives how many ``ul/li`` levels of
    the ``shopnav`` menu to descend; follow-up requests carry ``level + 1``.
    """
    if response.url in self.junk_urls:
        return

    hxs = HtmlXPathSelector(response)
    for node in hxs.select(u'//div[@class="item_wrapper"]'):
        loader = ProductLoader(item=Product(), selector=node)
        loader.add_xpath('name', u'.//div[@class="name"]/a/text()')

        # The last text node of the price element is used.
        raw_price = node.select(
            u'.//div[@class="price"]/text()[last()]').extract()[0]
        # NOTE(review): lstrip() removes any leading run of the characters
        # 'K', 'r', '.' and ' ', not the literal prefix.
        cleaned = raw_price.strip().lstrip('Kr. ')
        cleaned = cleaned.replace('.', '').replace(',', '.')
        loader.add_value('price', cleaned)

        href = node.select(u'.//div[@class="name"]/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(get_base_url(response), href))
        yield loader.load_item()

    depth = response.meta.get('level', 1)
    menu_xpath = (u'//div[@id="shopnav"]/'
                  + u'/'.join([u'ul/li'] * depth)
                  + '/a/@href')
    for href in hxs.select(menu_xpath).extract():
        target = urljoin_rfc(get_base_url(response), href)
        yield Request(target, meta={'level': depth + 1})
def parse_products(self, response):
    """Yield the products of a ``products-list`` page and follow the next link.

    The price node is searched in three places, in order; products for
    which none of them matches are skipped.
    """
    hxs = HtmlXPathSelector(response)
    price_root = 'div[@class="product-shop left"]/div/div/'

    for product in hxs.select('//*[@id="products-list"]/li'):
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name',
                         'div[@class="product-details left"]/h2/a/text()')

        first_choice = (product.select(price_root + 'p/span/span/text()')
                        or product.select(price_root + 'span/text()'))
        if first_choice:
            price = first_choice[0]
        else:
            spans = product.select(price_root + 'p/span/text()')
            if not spans:
                # No price anywhere: indexing would raise IndexError and
                # abort the rest of the page.
                continue
            # With several spans the second one is used.
            price = spans[0] if len(spans) == 1 else spans[1]

        # NOTE(review): the selector node itself (not its extracted text) is
        # handed to the loader -- presumably ProductLoader copes; confirm.
        loader.add_value('price', price)
        loader.add_xpath('url',
                         'div[@class="product-details left"]/h2/a/@href')
        yield loader.load_item()

    next_links = hxs.select('//div[@class="right-nav right"]/a/@href').extract()
    if next_links:
        yield Request(next_links[0], callback=self.parse_products)
def parse(self, response):
    """Collect priced search results and request the first collected one.

    A result is collected whenever it has a price and is cheaper than the
    previously collected result. The first collected result's URL is then
    requested with ``parse_mfrgids`` as callback; the remaining collected
    loaders travel along in ``meta['next_prods']``.
    """
    hxs = HtmlXPathSelector(response)
    result_nodes = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')

    cheapest = None
    search_results = []
    for node in result_nodes:
        loader = ProductLoader(item=Product(), selector=node)
        loader.add_xpath('name', './/h3/a/span/text()')
        if not loader.get_output_value('name'):
            loader.add_xpath('name', './/h3/a/text()')
        loader.add_xpath('url', './/h3/a/@href')
        loader.add_xpath('price', './/ul/li/a/span/text()', re='\$(.*)')
        if not loader.get_output_value('price'):
            loader.add_xpath(
                'price',
                './/div[@class="newPrice"]//span[contains(@class,"price")]/text()')
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'].lower())

        if not loader.get_output_value('price'):
            continue
        if (cheapest is None
                or cheapest.get_output_value('price')
                > loader.get_output_value('price')):
            cheapest = loader
            search_results.append(cheapest)

    if not search_results:
        return

    cur_prod = search_results[0]
    next_prods = search_results[1:]
    request_meta = {'mfrgid': response.meta['mfrgid'],
                    'name': response.meta['name'],
                    'cur_prod': cur_prod,
                    'next_prods': next_prods}
    yield Request(cur_prod.get_output_value('url'),
                  callback=self.parse_mfrgids,
                  meta=request_meta,
                  dont_filter=True)
def parse(self, response):
    """Yield products (or sub-product requests), subcategories and next page.

    ``response.meta['level']`` (default 1) gives how many ``ul/li`` levels of
    the ``box-content`` menu to descend. Subcategory requests carry
    ``level + 1``; the next-page request keeps the current level.
    """
    hxs = HtmlXPathSelector(response)

    for row in hxs.select(u'//tr[contains(@class,"product-item")]'):
        loader = ProductLoader(item=Product(), selector=row)
        loader.add_xpath(
            'name', u'.//td[@class="productListingNewName"]/b/a/text()')

        raw_price = row.select(
            u'.//span[@class="js_price_tax"]/text()').extract()[0]
        # Drop thousands dots, then turn the decimal comma into a dot.
        loader.add_value(
            'price', raw_price.strip().replace('.', '').replace(',', '.'))

        href = row.select(
            u'.//td[@class="productListingNewName"]/b/a/@href').extract()[0]
        product_url = urljoin_rfc(get_base_url(response), href)
        loader.add_value('url', product_url)

        # If quantity field is not present on page, there are subproducts
        has_qty = row.select(u'.//input[@name="products_qty"]').extract()
        if not has_qty:
            yield Request(product_url, callback=self.parse_sub)
        else:
            yield loader.load_item()

    depth = response.meta.get('level', 1)
    menu_xpath = (u'//div[@class="box-content"]/'
                  + u'/'.join([u'ul/li'] * depth)
                  + '/a/@href')
    for href in hxs.select(menu_xpath).extract():
        yield Request(urljoin_rfc(get_base_url(response), href),
                      meta={'level': depth + 1})

    next_href = hxs.select(u'//li[@class="page-next"]/a/@href').extract()
    if next_href:
        yield Request(urljoin_rfc(get_base_url(response), next_href[0]),
                      meta={'level': depth})
def parse_product(self, response):
    """Yield one product per ``div.Item`` block of a product page.

    Each item's name is the page heading followed by the item's own text.
    Non-HTML responses and pages without a heading yield nothing.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    heading = hxs.select('//td[@class="ProductDetails"]/h1/text()').extract()
    if not heading:
        # Without a heading there is no base name to build item names from
        # (the raw result is an empty list, not a string).
        return
    base_name = heading[0].strip()

    url = urljoin_rfc(get_base_url(response), response.url)

    for item in hxs.select('//div[@class="Item"]'):
        loader = ProductLoader(item=Product(), selector=item)
        loader.add_value('url', url)

        own_text = ''.join(item.select('./text()').extract())
        name = base_name
        if own_text:
            name += ' ' + own_text.strip()
        loader.add_value('name', name)

        loader.add_xpath('price', './/span[@class="price"]/text()')
        loader.add_xpath('price', './div[@class="price"]/span/text()')
        yield loader.load_item()
def parse_product(self, response):
    """Yield the product described by the page's ``itemprop`` name and price."""
    page = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=page)

    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    product_loader.add_xpath('price', '//span[@itemprop="price"]/text()')

    item = product_loader.load_item()
    yield item
def parse_product(self, response):
    """Yield the product of a detail page, identified by its part number.

    The name is ``response.meta['name']`` followed by the part number read
    from the product-information table.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)

    part_number_texts = hxs.select(
        '//*[@id="product-information"]/table'
        '/tr[th/text()="Part number"]/td/span/text()').extract()
    part_number = ''.join(part_number_texts).strip()

    product_loader.add_value('identifier', part_number)
    full_name = ' '.join((response.meta['name'].strip(), part_number))
    product_loader.add_value('name', full_name)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath(
        'price', '//*[@id="product-price"]/p[@class="no-vat"]/text()')
    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page, tagged with ``response.meta['sku']``.

    The price is the first number with a decimal part found in the
    ``product_price`` paragraph.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    loader.add_xpath('name', '//h1[@id="product_description"]/text()')

    price_matches = hxs.select(
        '//p[@id="product_price"]/span/text()').re('(\d+(?:\.\d+))')
    loader.add_value('price', price_matches[0])

    loader.add_value('sku', response.meta['sku'])
    loader.add_value('url', response.url)
    yield loader.load_item()
def parse_products(self, hxs, response):
    """Yield one product per container holding an ``h3.product_name`` heading.

    :param hxs: selector over the listing page.
    :param response: the response, used to resolve relative product links.
    """
    # Each product container is the grandparent of its name heading.
    containers = hxs.select('//h3[@class="product_name"]/../..')
    for container in containers:
        product_loader = ProductLoader(item=Product(), selector=container)
        product_loader.add_xpath(
            'name', './/h3[@class="product_name"]/a/text()')

        href = container.select(
            './/h3[@class="product_name"]/a/@href').extract()[0]
        product_loader.add_value(
            'url', urljoin_rfc(get_base_url(response), href))

        product_loader.add_xpath('price', './/p[@class="price"]/text()')
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page.

    The price text has its dots removed and its commas turned into dots;
    only the part after the last colon is kept.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', '//*[@id="header"]/text()')
    product_loader.add_value('url', response.url)

    price_text = ''.join(
        hxs.select('//*[@id="productdesc"]/font/font/text()').extract())
    price_text = price_text.replace('.', '').replace(',', '.')
    if price_text:
        price_text = price_text.split(':')[-1]
    product_loader.add_value('price', price_text)

    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a ``productDetail`` page.

    The SKU is the number following "Ref. Code:" in the first matching
    paragraph of the detail block.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(response=response, item=Product())

    product_loader.add_value('url', response.url)
    product_loader.add_xpath(
        'name',
        '//div[@id="productDetail"]//h1[@class="productDetailTitle"]/text()')
    product_loader.add_xpath(
        'price',
        '//div[@id="productDetail"]//span[contains(@class,"price")]/text()')

    first_paragraph = hxs.select('//div[@id="productDetail"]//p[1]')[0]
    ref_codes = first_paragraph.re('Ref\. Code: (\d+)')
    product_loader.add_value('sku', ref_codes)

    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page, tagged with ``response.meta['sku']``.

    Name and price come from the ``prodTITLE`` and ``prodDETAILS`` blocks.
    """
    # The loader evaluates the XPaths against the response itself, so no
    # separate selector or base URL is needed here.
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', '//div[@id="prodTITLE"]//h1/text()')
    product_loader.add_xpath(
        'price', '//div[@id="prodDETAILS"]//span[@class="price"]/text()')
    product_loader.add_value('sku', response.meta['sku'])
    product_loader.add_value('url', response.url)
    yield product_loader.load_item()
def parse(self, response):
    """Return the product for a known URL, or ``None`` for any other URL.

    ``self.products`` maps product page URLs to SKUs; only responses whose
    URL is a key of that mapping produce an item.
    """
    # Membership test instead of dict.has_key(), which no longer exists in
    # Python 3.
    if response.url not in self.products:
        return None

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('sku', self.products[response.url])
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//*[@id="feature_content_info"]/h1/text()')
    loader.add_xpath('price', '//*[@id="productBuy"]/p/span/text()')
    return loader.load_item()
def parse(self, response):
    """Yield the product described by the page's ``itemprop`` name and price."""
    # The loader evaluates the XPaths against the response itself, so no
    # separate selector or base URL is needed here.
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//*[@itemprop="price"]/text()')
    loader.add_value('url', response.url)
    yield loader.load_item()
def parse_products(self, hxs, response):
    """Yield one product per ``SearchGrid`` row that links to a detail page.

    :param hxs: selector over the search results page.
    :param response: the response, used to resolve relative product links.
    """
    detail_link = './/a[contains(@href, "productdetail.aspx")]'

    # Rows are the grandparents of the product-detail links.
    rows = hxs.select(
        '//table[@class="SearchGrid"]'
        '//td/a[contains(@href, "productdetail.aspx")]/../..')
    for row in rows:
        product_loader = ProductLoader(item=Product(), selector=row)

        href = row.select(detail_link + '/@href').extract()[0]
        product_loader.add_value(
            'url', urljoin_rfc(get_base_url(response), href))

        # Second cell holds the name link, third cell the price text.
        product_loader.add_xpath(
            'name',
            './/td[position() = 2]'
            '//a[contains(@href, "productdetail.aspx")]/text()')
        product_loader.add_xpath('price', './/td[position() = 3]//text()')
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a part page.

    The SKU is the text that sits next to the bold "Model #:" label.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(response=response, item=Product())

    product_loader.add_xpath('name', '//h1[@id="partNameId"]/text()')
    product_loader.add_value('url', response.url)
    product_loader.add_xpath(
        'price', '//font[@class="txt-purchaseprice20blue"]/text()')

    # Text nodes of the element that contains the "Model #:" label.
    model_texts = hxs.select(
        '//b[contains(text(), "Model #:")]/../text()').extract()
    product_loader.add_value('sku', ''.join(model_texts).strip())

    yield product_loader.load_item()
def parse(self, response):
    """Yield the product of a detail page, if the page has a description.

    Pages without any text in the ``ProductDetail1_lblDescription`` span
    yield nothing.
    """
    hxs = HtmlXPathSelector(response)
    description_texts = hxs.select(
        '//span[@id="ProductDetail1_lblDescription"]//text()').extract()
    if not description_texts:
        return

    product_loader = ProductLoader(item=Product(), response=response)
    # The whole list of text nodes is handed to the loader as the name.
    product_loader.add_value('name', description_texts)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('price', '//*[@class="yourPriceText"]//text()')
    product_loader.add_value('sku', response.meta['sku'])
    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page, tagged with ``response.meta['sku']``.

    The price text has its commas replaced by dots before being loaded.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)

    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h1[@itemprop="name"]/text()')

    raw_price = hxs.select(u'//span[@itemprop="price"]/text()').extract()[0]
    product_loader.add_value('price', raw_price.replace(',', '.'))

    product_loader.add_value('sku', response.meta['sku'])
    yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page, tagged with ``response.meta['sku']``.

    Name and price are read from the ``buying`` block; commas in the price
    text are replaced by dots.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)

    product_loader.add_value('url', response.url)
    product_loader.add_xpath(
        'name',
        u'//div[@class="buying"]/h1[@class="parseasinTitle"]'
        u'/span[@id="btAsinTitle"]/text()')

    raw_price = hxs.select(
        u'//div[@class="buying"]/table[@class="product"]'
        u'//b[@class="priceLarge"]/text()').extract()[0]
    product_loader.add_value('price', raw_price.replace(',', '.'))

    product_loader.add_value('sku', response.meta['sku'])
    yield product_loader.load_item()
def parse_products(self, hxs, response):
    """Yield one product per parent element of a ``titre_pdt`` block.

    :param hxs: selector over the listing page.
    :param response: the response, used to resolve relative product links.
    """
    base_url = get_base_url(response)

    for container in hxs.select('//div[@id="titre_pdt"]/..'):
        product_loader = ProductLoader(selector=container, item=Product())
        product_loader.add_xpath('name', './/h2/text()')

        href = container.select('.//div[@id="img_pdt"]/a/@href').extract()[0]
        product_loader.add_value('url', urljoin_rfc(base_url, href))

        # All text fragments of the price link, glued together.
        price_parts = container.select(
            ".//a[@class='prix_normal']//text()").extract()
        product_loader.add_value('price', u''.join(price_parts))

        yield product_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page.

    Identifier and SKU both come from ``response.meta['row']``
    (``PRODUCT_NUMBER``). Categories are the breadcrumb texts without the
    first entry. Stock is set to 0 when the page carries no ``InStock``
    availability link.
    """
    hxs = HtmlXPathSelector(response)
    row = response.meta['row']

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('identifier', row['PRODUCT_NUMBER'])
    loader.add_value('sku', row['PRODUCT_NUMBER'])
    loader.add_xpath('brand', '//span[@itemprop="brand"]/text()')

    # Skip the first breadcrumb entry.
    categories = hxs.select('//a[@class="breadcrumb"]/text()').extract()[1:]
    loader.add_value('category', categories)

    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    loader.add_xpath('price', '//span[@id="price_container"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//img[@id="product_image"]/@src')

    in_stock = hxs.select(
        '//link[@itemprop="availability" and contains(@href, "InStock")]')
    if not in_stock:
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_products(self, hxs, response):
    """Yield products from both the paged blocks and the ``multi`` table.

    :param hxs: selector over the listing page.
    :param response: the response, used to resolve relative product links.

    The SKU of each product is derived from its link by ``self._get_sku``.
    """
    # Layout 1: one "productData-*" block per product.
    for block in hxs.select('//div[starts-with(@id, "productData-")]'):
        block_loader = ProductLoader(item=Product(), selector=block)
        block_loader.add_xpath('name', './/a[@class="pagedLink"]/text()')
        block_loader.add_xpath('price',
                               './/div[@class="pagedPriceSale"]/text()')
        href = block.select('.//a[@class="pagedLink"]/@href').extract()[0]
        block_loader.add_value('url',
                               urljoin_rfc(get_base_url(response), href))
        block_loader.add_value('sku', self._get_sku(href))
        yield block_loader.load_item()

    # Layout 2: rows of the "multi" table (parents of "multi-product3" cells).
    for table_row in hxs.select(
            '//table[@id="multi"]//td[@id="multi-product3"]/..'):
        row_loader = ProductLoader(item=Product(), selector=table_row)
        row_loader.add_xpath('name', './/td[position() = 2]/a/text()')
        href = table_row.select('.//td[position() = 2]/a/@href').extract()[0]
        row_loader.add_value('url',
                             urljoin_rfc(get_base_url(response), href))
        row_loader.add_value('sku', self._get_sku(href))
        row_loader.add_xpath('price', './/td[@id="multi-price2"]//text()')
        yield row_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page.

    Identifier and SKU both come from ``response.meta['row']``
    (``PRODUCT_NUMBER``). Stock is set to 0 when the page carries an
    ``OutOfStock`` availability meta tag.
    """
    hxs = HtmlXPathSelector(response)
    row = response.meta['row']

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('identifier', row['PRODUCT_NUMBER'])
    loader.add_value('sku', row['PRODUCT_NUMBER'])
    loader.add_xpath('brand', '//span[@class="brand"]/text()')

    categories = hxs.select('//ul[@class="crumb"]/li/a/text()').extract()
    loader.add_value('category', categories)

    loader.add_xpath('name', '//h2[@itemprop="name"]/text()')
    loader.add_xpath('price', '//span[@class="list_price"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//img[@id="imyimage"]/@src')

    out_of_stock = hxs.select(
        '//meta[@itemprop="availability" and contains(@content, "OutOfStock")]')
    if out_of_stock:
        loader.add_value('stock', 0)

    yield loader.load_item()
def parse_product(self, response):
    """Load a product without price and post it to the site's cart endpoint.

    Everything except the price is read from the page. The loaded item is
    then put in the request meta under ``product`` and an add-to-cart form
    request is yielded with ``self.parse_price`` as callback.
    """
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', '//h1[@class="prod_name"]/text()')

    stock_text = ''.join(
        hxs.select('//span[contains(@class, "prod_stock_text")]/text()')
        .extract())
    # Stock is zeroed unless the stock label contains "EN STOCK".
    if 'EN STOCK' not in stock_text.upper():
        product_loader.add_value('stock', 0)

    for category in response.meta['categories']:
        product_loader.add_value('category', category)

    product_loader.add_xpath(
        'brand', '//li[span[contains(text(), "Fabricante")]]/text()')
    product_loader.add_value('shipping_cost', 6.99)
    product_loader.add_xpath(
        'sku', u'//li[span[contains(text(), "Cod. Artículo")]]/text()')

    identifier = hxs.select(
        '//input[@id="JS_google_remarketing__prodid"]/@value').extract()
    product_loader.add_value('identifier', identifier)

    images = hxs.select('//ul[@class="etalage"]//img/@src').extract()
    if images:
        product_loader.add_value('image_url', images[0])

    item = product_loader.load_item()

    request_meta = response.meta.copy()
    request_meta['product'] = item

    cart_url = 'http://www.mytelecom.es/es/ajax_cart_update/only_minicart:true'
    form_fields = {
        'product_id': item['identifier'],
        'quantity': '1',
        'view_minicart': 'front_cart/v_modal_add_cart',
    }
    ajax_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
    }
    yield FormRequest(cart_url,
                      formdata=form_fields,
                      headers=ajax_headers,
                      dont_filter=True,
                      meta=request_meta,
                      callback=self.parse_price)
def parse(self, response):
    """Yield the first ``tblParts`` row when its SKU matches the searched one.

    The row is emitted only if the loaded SKU equals
    ``response.meta['sku']`` lower-cased.
    """
    hxs = HtmlXPathSelector(response)
    rows = hxs.select('//table[@id="tblParts"]//tr[@onmouseout]')
    if not rows:
        return

    row_loader = ProductLoader(item=Product(), selector=rows[0])
    row_loader.add_xpath('name', './td[3]/div/a//text()')

    # The link is looked up across all matched rows; the first hit is used.
    href = rows.select('./td[3]/div/a/@href').extract()[0]
    row_loader.add_value('url', urljoin_rfc(get_base_url(response), href))

    row_loader.add_xpath('price', './td[6]//text()', re='\$(.*)')
    row_loader.add_xpath('sku', './td[3]/div/a[2]/text()')

    wanted = response.meta['sku'].lower()
    if row_loader.get_output_value('sku') == wanted:
        yield row_loader.load_item()
def parse(self, response):
    """Yield the product whose link contains the searched SKU, if any.

    Two layouts are handled: a single-product block, or a list of thumbnail
    containers. With thumbnails, the last container whose link contains
    ``response.meta['sku']`` wins.
    """
    hxs = HtmlXPathSelector(response)

    single = hxs.select('//div[@class="single_product clearfix"]')
    thumbnails = None
    if not single:
        thumbnails = hxs.select('//div[@class="ThumbContainer1"]')
        if not thumbnails:
            return

    matched = None
    if thumbnails is not None:
        for thumb in thumbnails:
            candidate = ProductLoader(item=Product(), selector=thumb)
            link = thumb.select(
                './/div[@class="ThumbImage"]//a/@href').extract()[0]
            if response.meta['sku'] not in link:
                continue
            candidate.add_value('url', link)
            candidate.add_xpath(
                'name', './/span[@class="underline"]/strong/text()')
            # Fall back to the <i> text when the first lookup gave no name.
            if not candidate.get_output_value('name'):
                candidate.add_xpath(
                    'name', './/span[@class="underline"]/strong/i/text()')
            candidate.add_value('sku', response.meta['sku'])
            candidate.add_xpath('price', './/span[@class="Label1"]/text()')
            matched = candidate
    else:
        candidate = ProductLoader(item=Product(), selector=single)
        link = single.select(
            './/div[@class="product_image"]/a/@href').extract()[0]
        if response.meta['sku'] in link:
            candidate.add_value('url', link)
            candidate.add_xpath('name', './/li[@class="product_title"]/text()')
            candidate.add_value('sku', response.meta['sku'])
            # The second text node of the price block is used.
            second_price_text = single.select(
                './/div[@class="price"]/text()').extract()[1]
            candidate.add_value('price', second_price_text)
            matched = candidate

    if matched:
        yield matched.load_item()
def parse_product(self, response):
    """Yield the product of a detail page.

    The identifier is the hidden product-id input when present, otherwise
    the part of the last URL segment that precedes the first dash. A missing
    price is loaded as 0.
    """
    hxs = HtmlXPathSelector(response)

    id_values = hxs.select(
        '//input[@id="product_page_product_id"]/@value').extract()
    if id_values:
        identifier = id_values[0]
    else:
        identifier = response.url.split('/')[-1].split('-')[0]

    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('identifier', identifier)
    loader.add_xpath('name', '//div[@id="product_title"]/h1/text()')
    loader.add_value('category', response.meta['category'])
    loader.add_xpath('brand', '//a[@class="brand_image"]/@title')
    loader.add_xpath('sku', '//h2[@id="product_reference"]/span/text()')
    loader.add_value('url', response.url)

    price_texts = hxs.select('//span[@id="our_price_display"]/text()').extract()
    if not price_texts:
        price = 0
    else:
        # Decimal comma -> dot, then remove every whitespace character.
        price = ''.join(price_texts[0].replace(',', '.').split())
    loader.add_value('price', price)

    loader.add_xpath('image_url', '//div[@id="image-block"]/img/@src')
    yield loader.load_item()
def parse(self, response):
    """Yield the listed products whose name equals ``response.meta['name']``.

    Each product's own name is lower-cased before the comparison. Matching
    items are tagged with ``response.meta['sku']``.
    """
    hxs = HtmlXPathSelector(response)
    for product in hxs.select('//li[@class="item first"]'):
        # Read the name from this product, not from the whole result list,
        # so every entry is judged on its own name.
        names = product.select('h2[@class="product-name"]/a/text()').extract()
        if not names or names[0].lower() != response.meta['name']:
            continue

        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('name', 'h2[@class="product-name"]/a/text()')
        loader.add_xpath('url', 'h2[@class="product-name"]/a/@href')
        loader.add_xpath(
            'price',
            'div[@class="price-box"]/span[@class="price-excluding-tax"]'
            '/span[@class="price"]/text()')
        loader.add_value('sku', response.meta['sku'])
        yield loader.load_item()
def parse_node(self, response, node):
    """Turn one feed node into a product.

    :param response: the feed response; anything but an ``XmlResponse`` is
        ignored (``None`` is returned).
    :param node: selector for the feed entry.
    :returns: the loaded item, or an empty ``Product`` when no price could
        be loaded.

    The identifier (also used as SKU) is the path segment following
    ``product/`` in the entry's product URL.
    """
    if not isinstance(response, XmlResponse):
        return

    id_matches = node.select(u'./product-url/text()').re(r'product/([^/]+)/')
    identifier = id_matches[0]

    node_loader = ProductLoader(item=Product(), selector=node)
    product_url = node.select(u'./product-url/text()').extract()[0]
    node_loader.add_value('url', product_url)
    node_loader.add_xpath('name', u'./title/text()')

    raw_price = node.select(u'./price/text()').extract()[0]
    node_loader.add_value('price', raw_price.replace(',', '.'))

    node_loader.add_xpath('category', u'merchant-category/text()')
    node_loader.add_xpath('brand', u'brand/text()')
    node_loader.add_xpath('image_url', u'image-url/text()')
    node_loader.add_value('sku', identifier)
    node_loader.add_value('identifier', identifier)

    if not node_loader.get_output_value('price'):
        return Product()
    return node_loader.load_item()
def parse_product(self, response):
    """Yield the product of a detail page, unless it is discontinued.

    The identifier is the part of the URL between the last dash and the
    following dot. When no price is loaded, the page is checked for a
    "discontinued" notice: such products are logged and dropped, others are
    emitted with stock 0 and price 0.
    """
    product_loader = ProductLoader(item=Product(), response=response)

    identifier = response.url.split('-')[-1].split('.')[0]
    product_loader.add_value('identifier', identifier)
    product_loader.add_xpath('name', '//meta[@itemprop="name"]/@content')
    product_loader.add_xpath('price', '//meta[@itemprop="price"]/@content')

    breadcrumb_texts = response.xpath(
        '//div[@id="prodBreadCrumbs"]//a/text()').extract()
    for crumb in breadcrumb_texts:
        product_loader.add_value('category', crumb)

    if product_loader.get_output_value('price') is None:
        notice = response.xpath(
            '//span[@class="markup-blu markup-lg"]/text()')
        if notice and 'discontinued' in notice.extract()[0].lower():
            self.log('delisted product {}'.format(response.url))
            return
        product_loader.add_value('stock', 0)
        product_loader.add_value('price', 0)

    product_loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
    product_loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
    product_loader.add_value('url', response.url)
    yield product_loader.load_item()
def parse_product(self, response):
    """Load a product from its detail page.

    The identifier comes from the hidden ``product`` input, falling back to
    the ``product=`` query value in the URL. The price is read from the
    ``productdata`` price span, then from two labelled lines in the
    description; thousands dots are removed and the decimal comma becomes a
    dot before the value is stored.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)

    identifier = hxs.select('//input[@name="product"]/@value').extract()
    if not identifier:
        identifier = re.search(r'product=(.*)', response.url).group(1)
    loader.add_value('identifier', identifier)

    loader.add_xpath('name', '//*[@id="header"]/text()')
    loader.add_xpath('brand', '//a[@class="hilight"]/text()')
    loader.add_value('url', response.url)

    price = hxs.select(
        '//*[@id="productdata"]//span[@class="price"]/text()').extract()
    # Fall back to the labelled values in the description, in order.
    for label_pattern in (u"Værdi: (.+)", u"Vejl. udsalgspris: (.+)"):
        if price:
            break
        price = hxs.select(
            '//*[@id="productdesc"]//font/text()').re(label_pattern)
    normalised = ''.join(price).replace('.', '').replace(',', '.')
    loader.add_value('price', normalised)

    loader.add_xpath('sku', '//input[@name="product"]/@value')
    loader.add_xpath('category',
                     u'//span[@id="productnavgroup"]/a[1]/text()')

    image = hxs.select(u'//img[@id="productimg"]/@src').extract()
    if image:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image[0]))
    yield loader.load_item()
def parse_product(self, response):
    """Load a product from its detail page.

    The SKU is read from the "SKU" row of the attribute specs table. When
    that row is absent, five digits captured from the "Barcode" row by the
    pattern below are used instead. The brand is fixed and the category is
    taken from ``response.meta['category']``.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    sku = hxs.select(
        '//table[@id="product-attribute-specs-table"]'
        '//th[@class="label" and contains(text(), "SKU")]'
        '/following-sibling::*/text()'
    ).extract() or hxs.select(
        '//table[@id="product-attribute-specs-table"]'
        '//th[@class="label" and contains(text(), "Barcode")]'
        '/following-sibling::*/text()'
    ).re(r'(\d\d\d\d\d)\d$')

    loader = ProductLoader(response=response, item=Product())
    loader.add_xpath(
        'price',
        '//div[@class="price-box"]/span[@class="price-excluding-tax"]/span[@class="price"]/text()'
    )
    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    loader.add_value('sku', sku)
    loader.add_value('brand', 'Draper')
    loader.add_value('url', urljoin_rfc(base_url, response.url))
    loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')
    loader.add_xpath(
        'image_url',
        '//div[@class="product-img-box"]//img[@id="image"]/@src')
    loader.add_value('category', response.meta['category'])
    yield loader.load_item()
def parse_cat(self, response):
    """Yield one product per ``product-info`` block on a category page.

    The identifier is read from a preceding sibling ``input`` whose name
    contains ``product_id``; the product link is made absolute against the
    page's base URL.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    for info in hxs.select('//div[@class="product-info"]'):
        href = info.select(
            './/a[@class="product-title"]/@href').extract()[0]

        loader = ProductLoader(item=Product(), selector=info)
        loader.add_xpath(
            'identifier',
            'preceding-sibling::input[contains(@name, "product_id")]/@value'
        )
        loader.add_xpath('name', './/a[@class="product-title"]/text()')
        loader.add_value('url', urljoin_rfc(base_url, href))
        loader.add_xpath('price', './/span[@class="price"]/span[@id]/text()')
        loader.add_xpath(
            'sku',
            './/p[@class="sku"]//span[contains(@id,"product_code")]/text()'
        )
        yield loader.load_item()
def parse_product(self, response):
    """Load a product, or request a stock check for each of its options.

    Without a ``selection[]`` drop-down the product is loaded straight from
    the page. With one, a form request per option is sent to
    ``cmsplus/store-stockcheck.php``; no price is set here and the partially
    filled loader travels in ``request.meta['item']`` to
    ``parse_product_price``.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)

    def first_value(xpath):
        # First extracted match; raises IndexError when nothing matches.
        return hxs.select(xpath).extract()[0]

    # Is this a product with many option/attributes?
    options = hxs.select('//select[@name="selection[]"]/option')
    if not options:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('sku', '//input[@name="prodid"]/@value')
        loader.add_xpath('name', '//h1/text()')
        loader.add_xpath('price', '//span[@id="ourprice"]/text()')
        yield loader.load_item()
        return

    log.msg('Found %d options for this product' % len(options))
    stock_check_url = urljoin_rfc(base_url, 'cmsplus/store-stockcheck.php')

    # Post these parameters
    prodid = first_value('//input[@name="prodid"]/@value')
    versionids = first_value('//input[@name="verids"]/@value')
    presel = first_value('//input[@name="presel"]/@value')
    curtime = "%s" % int(round(time.time() * 1000))

    name = first_value('//h1/text()')
    sku = first_value('//input[@name="prodid"]/@value')

    for option in options:
        option_value = option.select('@value').extract()[0]
        option_label = option.select('text()').extract()[0]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('sku', '%s_%s' % (sku, option_value))
        loader.add_value('name', '%s Type: %s' % (name, option_label))

        form_fields = {
            'prodid': prodid,
            'versionids': versionids,
            'presel': presel,
            'var[]': option_value,
            'firstrun': '1',
            'curtime': curtime
        }
        request = FormRequest(url=stock_check_url,
                              formdata=form_fields,
                              callback=self.parse_product_price)
        request.meta['item'] = loader
        yield request
def parse_product(self, response):
    """Load a product and attach a price-dependent shipping cost.

    The displayed price has its dots removed and its comma turned into a
    decimal point. The image is taken from the gallery ``img`` (or ``input``)
    and skipped when it ends in ``noimage.png``. Stock is set to 0 when an
    ``art-light-red`` block is present. The identifier is whatever follows
    ``id=`` in the URL.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)

    raw_name = hxs.select('//span[@class="ArTit"]//text()').extract()[0]
    # Collapse runs of whitespace in the title.
    loader.add_value('name', " ".join(raw_name.split()))
    loader.add_xpath(
        'sku', '//span[@id="MainContent_ngpArticolo_lblARCd_AR"]/text()')

    price_text = hxs.select(
        '//span[@id="MainContent_ngpArticolo_lblPrezzoScontato"]/text()'
    )[0].extract()
    price_text = price_text.replace('.', '').replace(',', '.')
    loader.add_value('price', price_text)

    loader.add_xpath(
        'brand',
        '//span[@id="MainContent_ngpArticolo_lblARMarcaDescrizione"]/text()'
    )
    loader.add_xpath(
        'category',
        '//span[@id="MainContent_ngpArticolo_lblCd_ARGruppo2"]/text()')

    gallery = (hxs.select('//div[@id="gallery"]/img/@src')
               or hxs.select('//div[@id="gallery"]/input/@src'))
    image_url = gallery[0].extract()
    if not image_url.strip().endswith('noimage.png'):
        loader.add_value('image_url', urljoin_rfc(base_url, image_url))

    if hxs.select('//div[@class="art-light-red"]'):
        loader.add_value('stock', 0)

    loader.add_value('url', response.url)
    loader.add_value('identifier', response.url.split('id=')[1])

    # Shipping tiers: the first upper bound the price falls below wins.
    amount = extract_price(price_text)
    shipping_cost = '100.00'
    tiers = ((100, '15.00'), (251, '30.00'), (751, '40.00'), (1000, '60.00'))
    for upper_bound, cost in tiers:
        if amount < Decimal(upper_bound):
            shipping_cost = cost
            break
    loader.add_value('shipping_cost', shipping_cost)

    yield loader.load_item()
def parse_product(self, response):
    """Yield bundle variants and the base product from a product page.

    When a ``Product.Bundle(...)`` JSON configuration is found in a script,
    the selections pre-chosen through hidden inputs (the mandatory options)
    are folded into a base name and base price. Every selection of each
    remaining option is then emitted as its own product, named and priced as
    base plus that selection. The plain product (page title and regular, or
    minimal/"from", price) is emitted afterwards as well. Items without a
    price output are not yielded. Non-HTML responses are ignored.
    """
    if not isinstance(response, HtmlResponse):
        return

    hxs = HtmlXPathSelector(response)
    options = hxs.select(u'//script').re(r'Product\.Bundle\((.*)\)')
    if options:
        bundle_options = json.loads(options[0])['options']
        mandatory_options = hxs.select(
            u'//div[@class="input-box"]//input[@type="hidden"]')
        name = hxs.select(
            u'//div[@class="product-name"]/h1/text()').extract()[0].strip()
        price = Decimal(0)

        # Keys of options already accounted for in the base name/price.
        exclude = set()
        for mandatory_option in mandatory_options:
            option_key = mandatory_option.select(
                u'./@name').re(r'bundle_option\[(.*)\]')[0]
            selection_key = mandatory_option.select(
                u'./@value').extract()[0]
            chosen = bundle_options[option_key]['selections'][selection_key]
            name += u' %s' % chosen['name'].strip()
            price += Decimal(chosen['price']).quantize(Decimal('0.01'))
            # Record the option *key*: the set is later subtracted from the
            # option keys, so storing the selector object would exclude
            # nothing and the mandatory options would be counted twice.
            exclude.add(option_key)

        for option_key in set(bundle_options.keys()).difference(exclude):
            selections = bundle_options[option_key]['selections']
            for selection_key in selections.keys():
                selection = selections[selection_key]
                selection_price = Decimal(
                    selection['price']).quantize(Decimal('0.01'))

                loader = ProductLoader(item=Product(), selector=hxs)
                loader.add_value('url', response.url)
                loader.add_value(
                    'name', name + u' %s' % selection['name'].strip())
                loader.add_value('price', price + selection_price)
                if loader.get_output_value('price'):
                    yield loader.load_item()

    # The plain product, emitted whether or not bundle options were found.
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_xpath('name', u'//div[@class="product-name"]/h1/text()')
    loader.add_xpath(
        'price',
        u'//span[@class="regular-price"]/span[@class="price"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath(
            'price',
            u'//div[@class="price-box"]//p[@class="minimal-price" or @class="price-from"]/span[@class="price"]/text()'
        )
    if loader.get_output_value('price'):
        yield loader.load_item()
def parse_product(self, response):
    """Complete the product passed in ``response.meta`` and yield it.

    Identifier, name, SKU ("Model Number" row), image and brand ("Brand
    Name" row) are read from the page; the category comes from
    ``response.meta``. Products whose upper-cased brand is listed in
    ``self.ignore_brands`` are logged and dropped. The loaded item is passed
    through ``self.add_shipping_cost`` before being yielded.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=response.meta['product'], selector=hxs)

    loader.add_xpath('identifier', '//input[@name="product"]/@value')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/*[@itemprop="name"]//text()')
    loader.add_xpath(
        'sku', '//tr/th[contains(text(), "Model Number")]/../td/text()')
    loader.add_value('category', response.meta.get('category'))

    img = hxs.select('//img[@id="product-image-zoom-img"]/@src').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))

    loader.add_xpath(
        'brand', '//tr/th[contains(text(), "Brand Name")]/../td/text()')

    # A page without a "Brand Name" row leaves the brand output empty; treat
    # that as "no brand" rather than failing on ``.strip()``.
    brand = (loader.get_output_value('brand') or '').strip().upper()
    if brand in self.ignore_brands:
        log.msg('Ignoring %s product: %s' % (brand, response.url))
        return

    yield self.add_shipping_cost(loader.load_item())
def parse_product(self, response):
    """Load a product page; queue it or yield it directly.

    Error pages are logged and skipped. A page with no breadcrumb links
    yields the result of ``self.retry`` instead. The "Ref. " text supplies
    both SKU and identifier, and the last breadcrumb link is the category.
    When a green add-to-basket button is present, the item and its form data
    are appended to ``self.collect_products``; otherwise the item is
    yielded.
    """
    hxs = HtmlXPathSelector(response)

    # skip error page
    if hxs.select('//div[@class="portlet-msg-error"]'):
        self.log('[WARNING] Error page when loading url: %s' % response.url)
        return

    breadcrumbs = hxs.select(
        '//ul[contains(@class, "breadcrumbs")]/li/span/a/text()').extract()
    if not breadcrumbs:
        # retry
        yield self.retry(response,
                         "Error getting category from: %s" % response.url)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name',
                     '//div[contains(@class, "description")]/h1/text()')
    loader.add_value('url', response.url)

    sku = hxs.select(
        '//div[contains(@class, "description-panel")]/span[contains(text(), "Ref. ")]/text()'
    ).extract()
    sku = sku[0].strip().replace('Ref. ', '') if sku else ''
    loader.add_value('sku', sku)
    loader.add_value('identifier', sku)
    loader.add_value('brand', '')
    loader.add_xpath('image_url', '//img[@id="current-zoomed"]/@src')
    loader.add_value('category', breadcrumbs[-1])

    price = hxs.select(
        '//div[@class="price"]/span[@class="amount"]/text()').extract()
    # Drop thousands dots and switch the decimal comma to a dot.
    price = price[0].replace('.', '').replace(',', '.') if price else 0
    # ``price`` is an extracted value, not an XPath expression, so it must
    # go through add_value.
    loader.add_value('price', price)

    item = loader.load_item()

    add_button = hxs.select(
        '//div[@class="add-section"]/a[contains(@class, "btn-green")]')
    if add_button:
        formdata = {'product': sku, 'quantity': '1'}
        self.collect_products.append({'item': item, 'formdata': formdata})
    else:
        yield item
def parse_product(self, response):
    """Follow variant links, or load the product when there are none.

    If the page has ``a[itemprop="url"]`` links, each is requested with this
    same callback and nothing else is emitted. Otherwise the product is
    loaded; breadcrumb entries from the seventh onwards are appended to the
    name when not already contained in it. The item is only yielded when an
    identifier was found.
    """
    hxs = HtmlXPathSelector(response)

    variant_links = hxs.select('//a[@itemprop="url"]/@href').extract()
    if variant_links:
        for href in variant_links:
            yield Request(response.urljoin(href), callback=self.parse_product)
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h1[@itemprop="name"]//text()')

    crumbs = hxs.select(
        '//div[@class="breadcrumbs"]/ul/li/a/span/text()').extract()
    for crumb in crumbs[6:]:
        # Re-read the name each time: it grows as crumbs are appended.
        if crumb not in loader.get_output_value('name'):
            loader.add_value('name', crumb)

    loader.add_xpath('identifier', '//meta[@itemprop="productID"]/@content')
    loader.add_xpath('price', '//span[@itemprop="price"]/text()')
    loader.add_css('price', '.price ::text')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')

    if loader.get_output_value('identifier'):
        yield loader.load_item()
def parse_products(self, hxs, response):
    """Yield url, name and price for each listed product.

    Product containers are the parents of ``productnamecolor`` links inside
    the ``v65-productDisplay`` table; when none are found, the grandparents
    of any ``productnamecolor`` link are used instead.
    """
    containers = hxs.select(
        '//table[@class="v65-productDisplay"]//a[contains(@class, "productnamecolor")]/..'
    ) or hxs.select('//a[contains(@class, "productnamecolor")]/../..')

    name_link = './/a[contains(@class, "productnamecolor")]'
    for container in containers:
        loader = ProductLoader(item=Product(), selector=container)
        loader.add_xpath('url', name_link + '/@href')
        loader.add_xpath('name', name_link + '/text()')
        loader.add_xpath(
            'price',
            './/font[contains(@class, "colors_productprice")]/text()')
        yield loader.load_item()
def parse_products(self, response):
    """Crawl brand and sub-category links and yield the listed products.

    Brand links from the categories info box are requested only while
    ``self.brand_crawled`` is false, after which the flag is set. Links in
    ``catView`` are always followed. Each ``li.product`` in ``productView``
    is loaded, using the special price when one is shown.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    def absolute(url):
        # Links not starting with "http" are resolved against the base URL.
        if re.search('^http', url):
            return url
        return urljoin_rfc(base_url, url)

    if not self.brand_crawled:
        brand_links = hxs.select(
            '//*[@class="infoBox-categories"]//a/@href').extract()
        for link in brand_links:
            yield Request(absolute(link), callback=self.parse_products)
        self.brand_crawled = True

    # Is it another subcategory page?
    for link in hxs.select('//div[@id="catView"]//a/@href').extract():
        yield Request(absolute(link), callback=self.parse_products)

    # Is it products page?
    special_price = './/h3/a/span[@class="productSpecialPrice"]/text()'
    for entry in hxs.select(
            '//div[@id="productView"]/ul/li[@class="product"]'):
        loader = ProductLoader(item=Product(), selector=entry)
        loader.add_xpath('name', './/h2/a/text()')
        if entry.select(special_price):
            loader.add_xpath('price', special_price)
        else:
            loader.add_xpath('price', './/h3/a/text()')
        loader.add_xpath('url', './/h2/a/@href')
        yield loader.load_item()
def parse(self, response):
    """Yield the products in the grid, then request the next page.

    Every cell of the ``grid`` table is loaded as a product (name and url
    from the title link, price from the ``itemprop="price"`` span). If a
    "next" link exists it is followed with the default callback.
    """
    hxs = HtmlXPathSelector(response)

    title_link = 'table/tr/td/div[@class="ttl g-std"]/a'
    for cell in hxs.select('//table[@class="grid"]/tr/td'):
        loader = ProductLoader(item=Product(), selector=cell)
        loader.add_xpath('name', title_link + '/@title')
        loader.add_xpath('url', title_link + '/@href')
        loader.add_xpath(
            'price',
            'table/tr/td/div/table/tr/td/span[@itemprop="price"]/text()')
        yield loader.load_item()

    next_page = hxs.select('//td[@class="next"]/a/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]))
def parse_product(self, response):
    """Load a product, attach empty review metadata and parse its reviews.

    The price is the ``data-prix-origine`` attribute when present, otherwise
    the displayed price text, otherwise 0. Stock is '1' when an "En stock"
    availability block exists and '0' otherwise. The loaded product is
    stored in ``response.meta['product']`` and everything produced by
    ``self.parse_review(response)`` is yielded.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(response=response, item=Product())

    loader.add_value('url', response.url)
    loader.add_xpath('identifier', '//div[@id="dv_ref"]/@title')
    loader.add_xpath('sku', '//div[@id="dv_ref"]/@title')

    price_candidates = hxs.select(
        '//span[contains(@class, "prix")]/@data-prix-origine'
    ).extract() or hxs.select(
        '//div[@class="fa-infos-prix"]/div//span[contains(@class, "prix")]/text()'
    ).extract()
    loader.add_value('price',
                     price_candidates[0] if price_candidates else 0)

    loader.add_xpath('name', '//h1[@itemprop="name"]//text()')

    breadcrumb = hxs.select(
        '//div[@class="breadcrumb"]/span/a/span[@itemprop="title"]/text()'
    ).extract()
    # The last breadcrumb entry is left out of the category path.
    loader.add_value('category', breadcrumb[:-1])

    image = ''.join(hxs.select('//img[@itemprop="image"]/@src').extract())
    if image:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image))

    loader.add_value('brand', response.meta.get('brand'))

    in_stock = hxs.select(
        '//div[contains(@class, "text-dispo") and contains(text(), "En stock")]'
    )
    loader.add_value('stock', '1' if in_stock else '0')

    product = loader.load_item()
    metadata = KeterMeta()
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product

    for result in self.parse_review(response):
        yield result
def parse_product(self, response):
    """Load a product whose number is supplied by the request's ``row``.

    ``response.meta['row']['PRODUCT_NUMBER']`` is stored as both identifier
    and SKU; the brand is fixed. All breadcrumb entries except the first
    become categories.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', './/h1[@itemprop="name"]/text()')
    loader.add_value('url', response.url)

    product_number = response.meta.get('row').get('PRODUCT_NUMBER')
    loader.add_value('identifier', product_number)
    loader.add_xpath(
        'image_url',
        '//div[@class="product"]//img[@id="product-image"]/@src')
    loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
    loader.add_value('sku', product_number)
    loader.add_value('brand', 'Poundstretcher')

    crumbs = hxs.select(
        '//li[contains(@itemtype,"Breadcrumb")]/a/span/text()')
    for crumb in crumbs[1:].extract():
        loader.add_value('category', crumb)

    yield loader.load_item()
def parse_product(self, response):
    """Load a product; pages without a price block are logged and skipped.

    The price is the second whitespace-separated token of the first
    ``titolo-cat`` text. The category is the last bold entry of the
    ``testo11-nero`` block and the identifier is the fifth ``/``-separated
    piece of the ``og:url`` meta content.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', '//span[@class="testo16-titolo"]/text()')
    loader.add_xpath('sku', '//span[@id="shownIntxt"]/text()')

    price_nodes = hxs.select('//span[@class="titolo-cat"]/text()')
    if not price_nodes:
        log.msg('ERROR: No price url: ' + response.url)
        return
    loader.add_value('price', price_nodes.extract()[0].split()[1])

    category_path = hxs.select(
        '//div[@class="testo11-nero"]/strong/text()').extract()
    loader.add_value('category', category_path[-1])
    loader.add_xpath('image_url',
                     '//a[@id="foto_visualizzata_link"]/img/@src')

    og_url = hxs.select('//meta[@property="og:url"]/@content').extract()[0]
    loader.add_value('identifier', og_url.split('/')[4])
    loader.add_value('url', response.url)
    yield loader.load_item()
def parse(self, response):
    """Yield only the lowest-priced result on the page.

    Each ``result_*`` block is loaded with name, url, price and the SKU from
    ``response.meta['sku']``. Results without a price output are ignored;
    on equal prices the earlier result is kept.
    """
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')

    cheapest = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        loader.add_xpath('name', './/h3[@class="title"]/a/text()')
        loader.add_xpath('url', './/h3[@class="title"]/a/@href')
        loader.add_xpath('price', './/td[@class="toeOurPrice"]/a/text()')
        loader.add_value('sku', response.meta['sku'])

        price = loader.get_output_value('price')
        if not price:
            continue
        if cheapest is None or cheapest.get_output_value('price') > price:
            cheapest = loader

    if cheapest:
        yield cheapest.load_item()
def parse(self, response):
    """Yield every ``prod-box`` product and follow the "Next" page link.

    When several "Next" links are found in the pagination footer, the last
    one is the one requested.
    """
    hxs = HtmlXPathSelector(response)

    description_area = 'span[@class="prod-desc-area"]/'
    for box in hxs.select('//a[@class="prod-box"]'):
        loader = ProductLoader(item=Product(), selector=box)
        loader.add_xpath(
            'name',
            description_area + 'span[@class="prod-name-row"]/strong/text()')
        loader.add_xpath('url', '@href')
        loader.add_xpath(
            'price',
            description_area + 'span[@class="price-prod"]/text()')
        yield loader.load_item()

    next_links = hxs.select(
        '//*[@id="sli_pagination_footer"]/'
        'span/a[text()="Next"]/@href').extract()
    if next_links:
        yield Request(next_links[-1], callback=self.parse)
def parse_product(self, response):
    """Complete and yield a product unless it is prescription-only.

    Pages showing "Prescription required" in the description list, or a
    bold "Prescription only" marker, produce nothing, as do pages without a
    product GUID. The brand is taken from the brand link text (with the
    "See More " / " Products" wording removed) and falls back to the first
    word of the product name. The item goes through
    ``self.add_shipping_cost`` before being yielded.
    """
    hxs = HtmlXPathSelector(response)

    prescription = response.xpath(
        '//ul[@itemprop="description"]/li[contains(text(), "Prescription required")]'
    ) or response.xpath('//strong[text()="Prescription only"]')
    if prescription:
        return

    loader = ProductLoader(item=response.meta['product'], selector=hxs)

    identifier = response.css(
        '#MainProduct_Product_ProductGUID ::attr(value)').extract()
    if not identifier:
        identifier = response.xpath(
            '//span[@class="hdnProductGuid"]/text()').extract()
    if not identifier:
        return

    loader.add_value('identifier', identifier)
    loader.add_xpath('url', '//link[@rel="canonical"]/@href')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1/text()')
    loader.add_xpath('sku', '//label[@class="prodCodeSize"]/b/text()')

    titles = response.css('.breadcrumb').xpath(
        '//span[@itemprop="title"]/text()').extract()
    # First and last entries are left out of the category path.
    loader.add_value('category', titles[1:-1])

    images = response.css('.slider-main img::attr(src)').extract()
    if not images:
        images = response.css('.img-responsive ::attr(src)').extract()
    if images:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), images[0]))

    brand_text = ''.join(
        response.xpath('//a[@class="abrandPLink"]//text()').extract())
    brand = brand_text.replace('See More ', '')
    brand = brand.replace(' Products', '').strip()
    if not brand:
        brand = loader.get_output_value('name').split()[0]
    if brand:
        loader.add_value('brand', brand)

    yield self.add_shipping_cost(loader.load_item())
def parse(self, response):
    """Yield the product when its model number matches the requested SKU.

    Nothing is produced for a "no results" page, for a page without a
    "Mfr. Model #" value, or when that value differs (case-insensitively)
    from ``response.meta['sku']``.
    """
    hxs = HtmlXPathSelector(response)
    model_xpath = '//td[text()="Mfr. Model #"]/following-sibling::td/text()'

    if hxs.select('//div[@id="noResultsMsg"]'):
        return
    if not hxs.select(model_xpath):
        return

    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//div[@id="PageTitle"]/h1//text()')
    loader.add_value('url', response.url)
    loader.add_xpath(
        'price',
        '//td[@class="tdrightalign"]/strong[starts-with(text(), "$")]/text()'
    )
    loader.add_xpath('sku', model_xpath)

    found_sku = loader.get_output_value('sku')
    if found_sku.lower() == response.meta['sku'].lower():
        yield loader.load_item()
def parse_product(self, response):
    """Complete the product passed in ``response.meta`` and yield it.

    Server error pages and pages without a rating-widget identifier produce
    nothing. The identifier is captured from the
    ``shopee.product.rating.showProductRating(...)`` script call and the
    categories from the ``addToBreadCrumbs`` calls (all but the last). The
    image URL is built from the SKU. The brand is read from the "Framlei"
    table row and falls back to the first word of the name. The item goes
    through ``self.add_shipping_cost`` before being yielded.
    """
    if 'Server is encountered an error' in response.body:
        return

    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=response.meta['product'], selector=hxs)

    # Check the match before reading the group: a page without the rating
    # call must be skipped, not fail with AttributeError on ``None``.
    identifier_match = re.search(
        r"shopee.product.rating.showProductRating\(.*'([^']+)','ecshopfx_rating_container'\);",
        response.body)
    if not identifier_match:
        return
    loader.add_value('identifier', identifier_match.group(1))
    if not loader.get_output_value('identifier'):
        return

    loader.add_xpath('sku',
                     '//span[@id="ecshopfx_product_serial_value"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h3[@id="producttitle"]/text()')
    loader.add_value(
        'category',
        re.findall(
            r"shopee.breadcrumb.addToBreadCrumbs\('breadcrumb_container','([^']+)'",
            response.body.decode('utf8'))[:-1])

    # The zoom image path is derived from the SKU, so it can only be built
    # when a SKU was actually found.
    sku = loader.get_output_value('sku')
    if sku:
        image_path = ('/elko/upload/images/products/ecshop_zoom_'
                      + sku + '.jpg')
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image_path))

    loader.add_xpath(
        'brand',
        'normalize-space(//td[contains(text(),"Framlei")]/following-sibling::td/text())'
    )
    if not loader.get_output_value('brand'):
        # Fall back to the first word of the name, when there is one.
        name_words = (loader.get_output_value('name') or '').split()
        if name_words:
            loader.add_value('brand', name_words[0])

    yield self.add_shipping_cost(loader.load_item())