def test_url_query_cleaner(self):
    """url_query_cleaner keeps only the whitelisted query parameters
    and drops any URL fragment."""
    cases = [
        (("product.html?id=200&foo=bar&name=wired", 'id'),
         'product.html?id=200'),
        (("product.html?id=200&foo=bar&name=wired", ['id', 'name']),
         'product.html?id=200&name=wired'),
        (("product.html?id=200&foo=bar&name=wired#id20", ['id', 'foo']),
         'product.html?id=200&foo=bar'),
    ]
    for args, expected in cases:
        self.assertEqual(url_query_cleaner(*args), expected)
def parse_products(self, response):
    """Parse a category listing page and yield one Product per listing.

    The dealer name is not a standard Product field, so it is attached
    under ``item['metadata']``.
    """
    # First breadcrumb entry is the site root; skip it.
    category = response.css('.breadcrumbs').xpath('.//a/text()').extract()[1:]
    products = response.css('.listing_item')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        image_url = product.css('.listing_item_image').xpath(
            'img/@src').extract_first()
        # extract_first() returns None when there is no image; guard
        # before the substring test to avoid a TypeError, and skip the
        # site's "noimage" placeholder.
        if image_url and 'noimage' not in image_url:
            loader.add_value('image_url', image_url)
        url = product.css('.listing_item_name').xpath('@href').extract_first()
        url = url_query_cleaner(response.urljoin(url))
        # SKU is the last path segment of the cleaned product URL.
        sku = url.split('/')[-1]
        loader.add_value('identifier', sku)
        loader.add_value('sku', sku)
        loader.add_value('url', url)
        loader.add_xpath('name', './/a[@class="listing_item_name"]/text()')
        loader.add_xpath(
            'price', './/span[@class="listing_item_basic_price"]/text()')
        loader.add_value('category', category)
        shipping_cost = product.css('.listing_item_delivery_costs').xpath(
            'text()').extract_first()
        loader.add_value('shipping_cost', extract_price_eu(shipping_cost))
        availability = product.css('.listing_item_availability').xpath(
            'text()').extract_first()
        # Availability text may be missing entirely; only mark the item
        # out of stock when the "Non disponibile" label is present.
        if availability and 'Non disponibile' in availability:
            loader.add_value('stock', 0)
        item = loader.load_item()
        dealer = product.css('.listing_item_merchant_name').xpath(
            'img/@alt').extract_first()
        item['metadata'] = {'Dealer': dealer}
        yield item
def parse_job_detail(self, response):
    """Parse a job-ad detail page and yield a populated JobItem.

    An item is produced only when both a title and a company name can
    be extracted from the page.
    """
    hxs = HtmlXPathSelector(response)
    title = hxs.select("//span[@class='header']/text()").extract_unquoted()
    company = hxs.select("//span[@class='fontblacklarge']/b/text()").extract_unquoted()
    if title and company:
        city = hxs.select("//span[@class='fontblacklarge']/b[2]/text()").extract_unquoted()
        # Category was determined on the listing page and passed along
        # in the request meta.
        category = response.request.meta['category']
        published_date = hxs.select("//span[@class='fontblacklarge']/../../following-sibling::tr[2]/td/b/text()").extract_unquoted()
        item = JobItem()
        images_url = hxs.select("//span[@class='fontblacklarge']/../following-sibling::td[2]/img/@src").extract()
        if images_url:
            item.load_image(self.get_base_url(response, images_url[0]))
        loader = JobLoader(item)
        loader.add_value('title', title)
        loader.add_value('company', company)
        loader.add_value('category', category)
        loader.add_value('city', city)
        # Keep only the query parameters that identify the ad so the
        # stored URL is stable across sessions.
        loader.add_value('details_url', url_query_cleaner(response.url, ('najdi', 'id')))
        loader.add_value('published_date', published_date)
        loader.add_value('id', self.generate_id(response.url, ('najdi', 'id')))
        loader.add_value('content', response.body_as_unicode())
        loader.add_value('source', self.name)
        loader.add_value('source_label', self.label)
        yield loader.load_item()
def parse_job_detail(self, response):
    """Parse a job-ad detail page and yield a populated JobItem.

    Only yields when both title and company are present on the page.
    """
    hxs = HtmlXPathSelector(response)
    title = hxs.select("//div[@id='businessmyaccountjobadpreviewmain']/div/font/text()").extract_unquoted()
    company = hxs.select("//li[@class='propertiesleft' and contains(text(),'Podjetje:')]/following-sibling::li/text()").extract_unquoted()
    if title and company:
        city = hxs.select("//li[@class='propertiesleft' and contains(text(),'Regija in kraj dela:')]/following-sibling::li/text()").extract_unquoted()
        # The label contains a non-ASCII character, hence the unicode
        # literal encoded to UTF-8 for the selector (Python 2).
        category = hxs.select(u"//li[@class='propertiesleft2' and contains(text(),'Področje dela:')]/following-sibling::li/text()".encode('utf-8')).extract_unquoted()
        images_url = hxs.select("//img[@id='mainimage']/@src").extract()
        item = JobItem()
        if images_url:
            item.load_image(self.get_base_url(response, images_url[0]))
        loader = JobLoader(item)
        loader.add_value('title', title)
        loader.add_value('company', company)
        loader.add_value('category', category)
        loader.add_value('city', city)
        loader.add_value('details_url', url_query_cleaner(response.url))
        # Published date is the text following the trailing ':' label.
        loader.add_value('published_date', hxs.select("//li[@class='dates']/text()").re(r".*:\s+(.*)"))
        loader.add_value('id', self.generate_id(response.url))
        loader.add_value('content', response.body_as_unicode())
        loader.add_value('source', self.name)
        loader.add_value('source_label', self.label)
        yield loader.load_item()
def parse_job(self, response):
    """Parse one page of job listings: follow pagination and queue a
    detail request (carrying a partially filled JobItem) per job."""
    hxs = HtmlXPathSelector(response)
    next_page = hxs.select("//ul[@class='pagination']/li[@class='selected']//following-sibling::li[1]/a/@href").extract()
    if next_page:
        next_page = self.get_base_url(response, next_page[0])
        # Guard against the pager linking back to the current page.
        if next_page != self.get_base_url(response, response.request.url):
            yield Request(url=next_page, callback=self.parse_job, meta={'category': response.request.meta['category']})
    for job in hxs.select("//ul[@id='newJobs']/li"):
        name = job.select("p[@class='jobTitle']/a/text()").extract_unquoted()
        company = job.select("strong/text()").extract_unquoted()
        if name and company:
            detail_url = job.select("p[@class='jobTitle']/a/@href").extract()
            detail_url = self.get_base_url(response, detail_url[0])
            if detail_url:
                images_url = job.select("div[@class='jobImgDiv']/img/@src").extract()
                item = JobItem()
                item['title'] = name
                item['company'] = company
                item['category'] = response.request.meta['category']
                item['summary'] = job.select("p[2]/text()").extract_unquoted()
                item['details_url'] = url_query_cleaner(detail_url)
                # Date follows a "label: value" pattern; keep the value.
                item['published_date'] = job.select("span[1]/text()").re(r".*:\s(.*)")
                if images_url:
                    item.load_image(self.get_base_url(response, images_url[0]))
                yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
def parse_product(self, response):
    """Parse a product page.

    When the product has SKU options, one item per variant is built
    from the embedded "stockMatrix" JavaScript data; otherwise a
    single item is assembled from the page fields.
    """
    hxs = HtmlXPathSelector(response)
    # NOTE(review): base_url is computed but unused in this method.
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('category', '//li[@typeof="v:Breadcrumb"]/a[@href!="/"]/text()')
    # Brand is only available inside an inline script block.
    brand = hxs.select('//script[@type="text/javascript"]/text()').re('brand: *\"(.+)\"')
    loader.add_value('brand', brand)
    loader.add_xpath('image_url', '//div[@id="amp-originalImage"]/img/@src')
    loader.add_value('url', url_query_cleaner(response.url))
    loader.add_xpath('name', '//input[@name="speedtrapProductDisplayName"]/@value')
    item = loader.load_item()
    if hxs.select('//ul[@class="productOptionsList"]/li[contains(@class, "skuAttribute")]'):
        # Variant data lives in a JS "stockMatrix" literal; patch it
        # into valid JSON (quote nulls, drop newlines) before parsing.
        data = hxs.select('//script[contains(text(),"stockMatrix =")]/text()')[0].extract()
        data = data.replace('\n', '').replace('null', '"null"')
        data = re.search('stockMatrix = (.*?);', data, re.DOTALL)
        data = json.loads(data.group(1)) if data else []
        for i, variant in enumerate(data):
            # The element starting with 'sku' marks the boundary between
            # attribute values and stock data; availability and price
            # sit at offsets +1 and +2 from it.
            sku = [elem for elem in variant if elem.startswith('sku')][0]
            sku_idx = variant.index(sku)
            product = Product(item)
            product['name'] = item['name'] + ' - ' + ' '.join(variant[:sku_idx]).title()
            product['identifier'] = '{}-{}'.format(response.meta.get('row').get('PRODUCT_NUMBER'), i)
            product['sku'] = product['identifier']
            product['price'] = variant[sku_idx + 2]
            product['stock'] = 1 if 'Available#Delivery' in variant[sku_idx + 1] else 0
            yield product
        return
    # No options: finish populating the single base product.
    loader.add_value('identifier', response.meta.get('row').get('PRODUCT_NUMBER'))
    loader.add_value('sku', response.meta.get('row').get('PRODUCT_NUMBER'))
    loader.add_xpath('price', '//input[@name="speedtrapPrice"]/@value')
    stock = 1 if hxs.select('//meta[@property="product:availability"]/@content[.="In Stock"]') else 0
    loader.add_value('stock', stock)
    yield loader.load_item()
def closing_parse_simple(self, response):
    """Post-process items produced by the parent spider's closing
    parse: drop the shipping_cost field and canonicalise product URLs
    down to their identifying parameters."""
    parent_items = super(Bike24Spider, self).closing_parse_simple(response)
    for item in parent_items:
        if isinstance(item, Product):
            if 'shipping_cost' in item:
                del item['shipping_cost']
            # Normalize URL
            cleaned = url_query_cleaner(item['url'],
                                        parameterlist=('content', 'product'),
                                        sep=';')
            item['url'] = cleaned
        yield item
def test_url_query_cleaner(self):
    """url_query_cleaner: whitelist filtering, duplicate separators,
    valueless parameters, unique=False duplicates, fragment stripping
    and remove=True blacklisting."""
    cases = [
        ('product.html?id=200',
         ("product.html?id=200&foo=bar&name=wired", ['id']), {}),
        ('product.html?id=200',
         ("product.html?&id=200&&foo=bar&name=wired", ['id']), {}),
        ('product.html',
         ("product.html?foo=bar&name=wired", ['id']), {}),
        ('product.html?id=200&name=wired',
         ("product.html?id=200&foo=bar&name=wired", ['id', 'name']), {}),
        ('product.html?id',
         ("product.html?id&other=3&novalue=", ['id']), {}),
        ('product.html?d=1&d=2&d=3',
         ("product.html?d=1&e=b&d=2&d=3&other=other", ['d']),
         {'unique': False}),
        ('product.html?id=200&foo=bar',
         ("product.html?id=200&foo=bar&name=wired#id20", ['id', 'foo']), {}),
        ('product.html?foo=bar&name=wired',
         ("product.html?id=200&foo=bar&name=wired", ['id']),
         {'remove': True}),
        ('product.html?name=wired',
         ("product.html?id=2&foo=bar&name=wired", ['id', 'foo']),
         {'remove': True}),
        ('product.html?foo=bar&name=wired',
         ("product.html?id=2&foo=bar&name=wired", ['id', 'footo']),
         {'remove': True}),
    ]
    for expected, args, kwargs in cases:
        self.assertEqual(expected, url_query_cleaner(*args, **kwargs))
def parse_post(self, response):
    """Collect image URLs from every post on a thread page and yield a
    single ImageItem carrying all of them.

    Relative image sources are resolved against the response URL and
    the whole query string is stripped so each image has a canonical
    URL.
    """
    image_urls = []
    hxs = HtmlXPathSelector(response)
    posts = hxs.select('//div[@class="l_post"]')
    for post in posts:
        # Usually at most one image per post, but iterate to be safe.
        images = post.select('.//div[@class="p_content"]/img/@src')
        for image in images:
            url = image.extract()
            url = urljoin_rfc(response.url, url)
            # An empty parameter list strips the entire query string.
            url = url_query_cleaner(url, [])
            # (Removed leftover debug `print` statement that wrote every
            # URL to stdout on each crawl.)
            image_urls.append(url)
    item = ImageItem()
    item['image_urls'] = image_urls
    yield item
def parse_product(self, response):
    """Parse a product detail page into a single Product item."""
    loader = ProductLoader(item=Product(), response=response)
    loader.add_xpath('name', '//h2[@class="product-title"]/text()')
    # The ProductID query parameter doubles as identifier and SKU.
    identifier = url_query_parameter(response.url, 'ProductID')
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    # Keep only the identifying query parameters in the stored URL.
    url = url_query_cleaner(response.url, ('ProductName', 'ProductID'))
    loader.add_value('url', url)
    loader.add_xpath(
        'price',
        '//div[contains(@class, "product-details")]//span[@class="price"]/text()'
    )
    image_url = response.xpath(
        '//img[@class="prodImg"]/@src').extract_first()
    loader.add_value('image_url', response.urljoin(image_url))
    # Stock is the first number in the "in stock" banner, if present.
    stock = response.xpath('//div[@id="MasterCopy_Instock"]/h4/text()').re(
        '\d+')
    if stock:
        loader.add_value('stock', stock[0])
    else:
        loader.add_value('stock', 0)
    yield loader.load_item()
def parse_job(self, response):
    """Parse one page of job ads: follow pagination and queue a detail
    request per ad with a pre-filled JobItem passed via meta."""
    hxs = HtmlXPathSelector(response)
    next_page = hxs.select("//div[@class='PagedList-pager']//ul/li[contains(@class, 'PagedList-currentPage')]/following-sibling::li[1]/a/@href").extract()
    if next_page:
        next_page = self.get_base_url(response, next_page[0])
        # Avoid re-requesting the page we are already on.
        if next_page != self.get_base_url(response, response.request.url):
            yield Request(url=next_page, callback=self.parse_job)
    # Category comes from the currently selected search filter option.
    category = hxs.select("//form[@id='searchForm']//select[@name='wfid']/option[@selected='selected']/text()").extract()
    informations = hxs.select("//table[@class='job-add-listing']//tr//div[@class='job-add-item-inner']")
    for information in informations:
        name = information.select("h2/a/text()").extract()
        company = information.select("p[3]/strong/text()").extract()
        if name and company:
            detail_url = information.select("h2/a/@href").extract()
            detail_url = self.get_base_url(response, detail_url[0])
            if detail_url:
                images_url = information.select("div[contains(@class,'city-logo')]/img/@src").extract()
                item = JobItem()
                item['title'] = name
                item['company'] = company
                item['category'] = category
                item['summary'] = information.select("p[2]/text()").extract()
                item['city'] = information.select("p[1]/text()").extract()
                item['details_url'] = url_query_cleaner(detail_url)
                item['published_date'] = information.select("div[contains(@class,'city-logo')]/div/text()").extract()
                if images_url:
                    item.load_image(self.get_base_url(response, images_url[0]))
                yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
def parse_job(self, response):
    """Parse a job listing page; follow pagination and queue a detail
    request per row with a pre-filled JobItem in meta."""
    hxs = HtmlXPathSelector(response)
    next_page = hxs.select("//span[@class='stevilke']/a[contains(@class, 'active')]/following-sibling::a[1]/@href").extract()
    if next_page:
        next_page = self.get_base_url(response, next_page[0])
        # Skip when the "next" link points back at the current page.
        if next_page != self.get_base_url(response, response.request.url):
            yield Request(url=next_page, callback=self.parse_job)
    # Category labels of the checked sector-filter checkboxes.
    category = hxs.select(
        "//input[@type='checkbox' and contains(@class, 'iskalnik_kriteriji_tip_sektor') and @checked]" +
        "//following-sibling::label[1]/text()"
    ).extract_unquoted()
    for job in hxs.select("//tr[@class='bg_oglas_dm']"):
        name = job.select("td[@class='ena']/div/a/b/text()").extract_unquoted()
        company = job.select("td[@class='dva']/a/text()").extract_unquoted()
        if name and company:
            detail_url = job.select("td[@class='ena']/div/a/@href").extract()
            detail_url = self.get_base_url(response, detail_url[0])
            if detail_url:
                images_url = job.select("td[@class='stiri']//img/@src").extract()
                item = JobItem()
                item['title'] = name
                item['company'] = company
                item['category'] = category
                item['city'] = job.select("td[@class='tri']/a/text()").extract_unquoted()
                item['details_url'] = url_query_cleaner(detail_url)
                # Extract a dd.mm.yyyy date from the surrounding text.
                item['published_date'] = job.select("td[@class='stiri']//div[2]/text()").re(r"\s+(\d{2}.\d{2}.\d{4})\s+")
                if images_url:
                    item.load_image(self.get_base_url(response, images_url[0]))
                yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
def parse(self, response):
    """Parse a paginated ASP.NET listing of job ads.

    Pagination is an AJAX postback: the next page is requested as a
    POST to the same URL using the __EVENTTARGET / __EVENTARGUMENT
    pair scraped from the pager link's javascript href.
    """
    hxs = HtmlXPathSelector(response)
    # The pager link holds a javascript call; pull its two arguments.
    next_page = hxs.select("//div[contains(@class, 'pagination')]/span/following-sibling::a[1]/@href").re(r"'(.*)','(.*)'")
    if next_page:
        yield Request(
            url=response.url,
            method='POST',
            headers={
                'content-type': 'application/x-www-form-urlencoded; charset=utf-8',
                'x-requested-with': 'XMLHttpRequest',
                'x-microsoftajax': 'Delta=true'
            },
            body=self.build_formdata({
                '__EVENTTARGET': next_page[0],
                '__EVENTARGUMENT': next_page[1],
            }),
            callback=self.parse
        )
    for add in hxs.select("//div[@class='cc-gv']/table/tbody//tr"):
        name = add.select("td[1]/a/text()").extract_unquoted()
        company = add.select("td[2]/text()").extract_unquoted()
        if name and company:
            detail_url = add.select("td[1]/a/@href").extract()
            detail_url = self.get_base_url(response, detail_url[0])
            if detail_url:
                item = JobItem()
                item['title'] = name
                item['company'] = company
                item['published_date'] = add.select("td[3]/text()").extract_unquoted()
                # BUG FIX: ('IDEPD') was a parenthesized string, not a
                # tuple (missing trailing comma); it only worked because
                # url_query_cleaner wraps bare strings in a list. Make
                # the single-element tuple explicit.
                item['details_url'] = url_query_cleaner(detail_url, ('IDEPD',))
                item['city'] = add.select("td[4]/text()").extract_unquoted()
                yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
def generate_id(self, detail_url, parameterlist=()):
    """Return a stable md5 hex digest identifying a job ad.

    The id is derived from the spider name plus the detail URL with
    its query string reduced to ``parameterlist``, so tracking
    parameters or parameter reordering do not change the id.
    """
    cleaned_url = url_query_cleaner(detail_url, parameterlist)
    seed = ''.join([self.name, cleaned_url]).encode('utf-8')
    return hashlib.md5(seed).hexdigest()
def parse_product(self, response):
    """Parse a product page and yield Product items.

    When the page has option <select>s, one item per option value is
    yielded instead of the base product. ``simple_run`` together with
    ``matched_identifiers`` restricts output to known identifiers, and
    ``id_seen`` deduplicates items across pages.
    """
    identifier = response.xpath(
        '//form[@id="pdAddToCart"]//input[@name="product"]/@value'
    ).extract()
    if not identifier:
        return
    loader = ProductLoader(item=Product(), response=response)
    # Normalize URL: keep only the parameters identifying the product.
    product_url = url_query_cleaner(response.url,
                                    parameterlist=('content', 'product'),
                                    sep=';')
    loader.add_value('url', product_url)
    loader.add_value('identifier', identifier)
    sku = response.xpath(
        '//table[@class="table-bordered table-striped table-product-datasheet"]'
        '//td[text()="Item Code:"]/following-sibling::td[1]/text()'
    ).extract()
    if sku:
        loader.add_value('sku', sku[0])
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    price = response.xpath(
        '//div[@class="box-price js-price"]/span[@itemprop="price"]/text()'
    ).extract()
    if price:
        # European formatting: '.' thousands separator, ',' decimals.
        price = extract_price(price[0].strip().replace('.', '').replace(
            ',', '.'))
        loader.add_value('price', price)
    else:
        loader.add_value('price', '0.0')
    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', image_url[0])
    brand = response.xpath(
        '//table[@class="table-bordered table-striped table-product-datasheet"]'
        '//td[text()="Manufacturer:"]/following-sibling::td[1]/text()'
    ).extract()
    if brand:
        loader.add_value('brand', brand[0])
    category = response.xpath(
        '//ul[@class="nav"]//li[contains(@class,"item-active")]/a/text()'
    ).extract()
    if category:
        loader.add_value('category', category)
    availability = response.xpath(
        '//*[@id="js-availability-label"]/text()').extract()
    if availability and 'unknown' in availability[0].lower():
        loader.add_value('stock', 0)
    product = loader.load_item()
    options = response.xpath(
        '//div[@class="input-group input-group-select"]/select')
    if not options:
        if not (getattr(self, 'simple_run', False) and (hasattr(self, 'matched_identifiers')) \
                and (product['identifier'] not in self.matched_identifiers)):
            if not product['identifier'] in self.id_seen:
                self.id_seen.append(product['identifier'])
                yield product
        return
    for sel in options:
        opt = ''
        select_name = sel.xpath('@name').extract()
        if select_name:
            opt = select_name[0].replace('opt_', '')
        # value="-2" is the "please choose" placeholder option.
        for option in sel.xpath('option[@value!="-2"]'):
            item = Product(product)
            opt_id = option.xpath('@value').extract()
            if opt_id:
                item['identifier'] += '-' + opt + '-' + opt_id[0]
            item['stock'] = 1
            # A data-av value of '100' is treated as out of stock.
            opt_stock = option.xpath('@data-av').extract()
            if opt_stock and opt_stock[0] == '100':
                item['stock'] = 0
            opt_name = option.xpath('text()').extract()
            if opt_name:
                item['name'] += ' - ' + opt_name[0]
            if getattr(self, 'simple_run', False) and (hasattr(self, 'matched_identifiers')) \
                    and (item['identifier'] not in self.matched_identifiers):
                continue
            if not item['identifier'] in self.id_seen:
                self.id_seen.append(item['identifier'])
                yield item
def parse_product(self, response):
    """Parse a product page; yields either the plain item or one item
    per variant taken from the embedded "stockMatrix" JS data."""
    loader = ProductLoader(item=Product(), response=response)
    # First breadcrumb entry is the site root; skip it.
    categories = response.xpath(
        '//ul[@class="breadcrumbList"]/li[@itemprop="itemListElement"]//span[@itemprop="name"]/text()'
    ).extract()[1:]
    loader.add_value('category', categories)
    # Brand is only present inside an inline script block.
    brand = response.xpath('//script[@type="text/javascript"]/text()').re(
        'brand: *\"(.+)\"')
    loader.add_value('brand', brand)
    loader.add_xpath('image_url', '//div[@id="amp-originalImage"]/img/@src')
    loader.add_value('url', url_query_cleaner(response.url))
    loader.add_xpath(
        'name', '//input[@name="speedtrapProductDisplayName"]/@value')
    identifier = response.xpath('//text()').re("productId: '(.*)'")[0]
    loader.add_value('identifier', identifier)
    sku = response.xpath('//span[@id="productEAN"]/text()').extract()
    sku = sku[-1].strip() if sku else ''
    loader.add_value('sku', sku)
    loader.add_xpath('price', '//input[@name="speedtrapPrice"]/@value')
    stock = 1 if response.xpath(
        '//meta[@property="product:availability"]/@content[.="In Stock"]'
    ) else 0
    loader.add_value('stock', stock)
    # Flat-rate shipping for this site.
    loader.add_value('shipping_cost', 3.99)
    item = loader.load_item()
    options = response.xpath(
        '//ul[@class="productOptionsList"]/li[contains(@class, "skuAttribute")]'
    )
    if options:
        # stockMatrix is a JS literal; patch it into valid JSON (quote
        # nulls, drop newlines) before parsing.
        data = response.xpath(
            '//script[contains(text(),"stockMatrix =")]/text()'
        )[0].extract()
        data = data.replace('\n', '').replace('null', '"null"')
        data = re.search('stockMatrix = (.*?);', data, re.DOTALL)
        data = json.loads(data.group(1)) if data else []
        for i, variant in enumerate(data):
            # The element starting with 'sku' marks the boundary between
            # attribute values and stock data; availability and price
            # sit at offsets +1 and +2 from it.
            sku = [elem for elem in variant if elem.startswith('sku')][0]
            sku_idx = variant.index(sku)
            product = Product(item)
            product['name'] = item['name'] + ' - ' + ' '.join(
                variant[:sku_idx]).title()
            product['identifier'] += '-' + sku
            product['price'] = extract_price(str(variant[sku_idx + 2]))
            if not ('Available#Delivery' in variant[sku_idx + 1]
                    or 'In stock#' in variant[sku_idx + 1]
                    or 'Low stock#' in variant[sku_idx + 1]):
                product['stock'] = 0
            # Variant-specific image, keyed by the first attribute value.
            image_code = response.xpath(
                '//li[input[@value="' + variant[0] + '"]]/input[@class="colourImageUrl"]/@value').extract()
            if image_code:
                image_url = 'http://media.very.co.uk/i/very/' + image_code[0]
                product['image_url'] = image_url
            yield product
    else:
        yield item
def parse_product(self, response):
    """Parse a product page driven by the embedded "product/data" JSON.

    Falls back to the redux-state parser when the JSON blob is absent,
    retrying the page (up to 20 times) when neither is present.
    Variant products are re-requested one by one via the ``selected``
    query parameter.
    """
    data = response.xpath(
        '//script/text()[contains(., "product/data")]').extract_first()
    rdata = response.xpath(
        '//script/text()[contains(., "window.__WML_REDUX_INITIAL_STATE__")]'
    ).extract_first()
    if not data:
        if rdata:
            for product in self.parse_product_rdata(response):
                yield product
        else:
            # Product data renders intermittently; retry a bounded
            # number of times before giving up.
            retries = response.meta.get('retries', 0)
            if retries < 20:
                self.logger.warning('No product data on %s. Retrying.' %
                                    response.url)
                yield Request(response.url,
                              self.parse_product,
                              meta={'retries': retries + 1},
                              dont_filter=True)
            else:
                self.logger.warning(
                    'No product data found on %s. Gave up retrying' %
                    response.url)
        return
    data = json.loads(
        re.search('product/data",[ \n]*({.+})', data).group(1))
    loader = ProductLoader(item=Product(), response=response)
    product_id = response.xpath(
        '//form[@name="SelectProductForm"]/input[@name="product_id"]/@value'
    ).extract()
    if product_id:
        identifier = product_id[0]
    else:
        # Fall back to the last path segment of the cleaned URL.
        identifier = url_query_cleaner(response.url).split('/')[-1]
        identifier = identifier.split('?')[0]
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    # Keep only non-empty, stripped text nodes of the name heading.
    name = filter(
        lambda n: n,
        map(unicode.strip,
            response.xpath('//h1[@itemprop="name"]//text()').extract()))
    if not name:
        name = filter(
            lambda n: n,
            map(
                unicode.strip,
                response.xpath(
                    '//h1[contains(@class,"product-name")]//text()').
                extract()))
    if name:
        loader.add_value('name', name[0].strip())
    #loader.add_xpath('name', '//option[@selected and not(@disabled)]/text()')
    loader.add_css('brand', 'a.product-brand span::text')
    categories = response.xpath(
        '//div[@itemprop="breadcrumb"]//span[@itemprop="title"]/text()'
    ).extract()
    if not categories:
        categories = response.xpath(
            '//div[@itemprop="breadcrumb"]//span[@itemprop="name"]/text()'
        ).extract()
    if categories:
        if 'Home' in categories:
            categories.remove('Home')
        loader.add_value('category', categories)
    elif 'category' in response.meta:
        loader.add_value('category', response.meta['category'])
    loader.add_value('url', response.url)
    # Price appears in several different page layouts; try each source
    # in order until one matches.
    price = response.xpath('//@data-product-price').extract_first()
    if price:
        price = [price] if price else None
    if not price:
        price = response.xpath(
            '//div[@id="WM_PRICE"]//*[contains(@class,"camelPrice")]/span/text()'
        ).extract()
    if not price:
        price = response.xpath(
            '//div[@class="onlinePriceMP"]//*[contains(@class,"camelPrice")]/span/text()'
        ).extract()
    if not price:
        price = response.xpath(
            '//div[@itemprop="offers"]/div[contains(@class, "product-price")]//*[@itemprop="price"][1]//text()'
        ).extract()
    if not price:
        price = response.xpath(
            '//div[@class="col5"]//div[contains(@class,"product-buying-table-row")][1]//div[contains(@class,"price-display")][1]//text()'
        ).extract()
    if not price:
        price = response.xpath('//*[@itemprop="price"]//text()').extract()
    price = ''.join(price).strip() if price else 0
    loader.add_value('price', price)
    stock = response.xpath(
        '//meta[@itemprop="availability"]/@content').extract_first()
    if not stock or stock != 'InStock':
        loader.add_value('stock', 0)
    image = response.xpath(
        '//div[@class="LargeItemPhoto215"]//img/@src').extract()
    if not image:
        image = response.xpath(
            '//div[contains(@class,"product-images")][1]//img/@src'
        ).extract()
    if image:
        loader.add_value('image_url', image[0])
    try:
        loader.add_value(
            'shipping_cost',
            data['buyingOptions']['shippingPrice']['displayPrice'])
    except KeyError:
        loader.add_css('shipping_cost', 'h2.js-shipping-primary-msg::text')
    if not data or not data.get('variantInformation'):
        yield loader.load_item()
        return
    if url_query_parameter(response.url, 'selected'):
        # Page for one selected variant: retry (bounded) when variant
        # info failed to render, otherwise append the selected option
        # labels to the item name.
        if response.css('div.product-buying-table').xpath(
                './/div[contains(., "Information unavailable")]'
        ) or price == 0:
            retries = response.meta.get('retries', 0)
            if retries < 9:
                yield Request(response.url,
                              self.parse_product,
                              meta={'retries': retries + 1},
                              dont_filter=True)
            return
        for option in data['variantInformation']['variantTypes']:
            try:
                loader.add_value('name', option['selectedValue'])
            except KeyError:
                pass
        yield loader.load_item()
        return
    # Base page: request every variant explicitly via ?selected=True.
    for variant in data['variantInformation']['variantProducts']:
        try:
            option_id = variant['buyingOptions']['usItemId']
        except KeyError:
            continue
        url = '/'.join(response.url.split('/')[:-1])
        url += '/%s' % option_id
        yield Request(add_or_replace_parameter(url, 'selected', 'True'),
                      self.parse_product)
def parse_product(self, response):
    """Parse a product page; yields the base Product, or one Product
    per non-placeholder <option> when the add-to-cart form has option
    <select>s.

    ``simple_run``/``matched_identifiers`` restrict output to known
    identifiers and ``id_seen`` deduplicates items across pages.
    """
    if not isinstance(response, HtmlResponse):
        return
    identifier = response.xpath(
        '//form[@id="pdAddToCart"]//input[@name="product"]/@value'
    ).extract()
    if not identifier:
        return
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), response=response)
    # Normalize URL: keep only the parameters identifying the product.
    product_url = url_query_cleaner(response.url,
                                    parameterlist=('content', 'product'),
                                    sep=';')
    loader.add_value('url', product_url)
    loader.add_value('identifier', identifier[0])
    sku = response.xpath(
        '//td[text()="Item Code:"]/following-sibling::td[1]/text()'
    ).extract()
    if sku:
        loader.add_value('sku', sku[0])
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    price = response.xpath('//span[@itemprop="price"]/text()').extract()
    if price:
        # European formatting: '.' thousands separator, ',' decimals.
        price = extract_price(price[0].strip().replace('.', '').replace(
            ',', '.'))
        loader.add_value('price', self.convert_to_pounds(str(price)))
    else:
        loader.add_value('price', '0.0')
    image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    brand = response.xpath(
        '//td[text()="Manufacturer:"]/following-sibling::td[1]/text()'
    ).extract()
    if brand:
        loader.add_value('brand', brand[0])
    category = response.xpath(
        '//main//span[@class="text-title"]/text()').extract()
    if category:
        loader.add_value('category', category[0].split(':')[0].strip())
    availability = response.xpath(
        '//div[@class="pd-availability"]/span[contains(text(),"Delivery")]/text()'
    ).extract()
    if availability and 'unknown' in availability[0].lower():
        loader.add_value('stock', 0)
    product = loader.load_item()
    options = response.xpath('//form[@id="pdAddToCart"]//select')
    if not options:
        if not (getattr(self, 'simple_run', False) and (hasattr(self, 'matched_identifiers')) \
                and (product['identifier'] not in self.matched_identifiers)):
            if not product['identifier'] in self.id_seen:
                self.id_seen.append(product['identifier'])
                yield product
        return
    for sel in options:
        opt = ''
        select_name = sel.xpath('@name').extract()
        if select_name:
            opt = select_name[0].replace('opt_', '')
        # value="-2" is the "please choose" placeholder option.
        for option in sel.xpath('option[@value!="-2"]'):
            item = Product(product)
            opt_id = option.xpath('@value').extract()
            if opt_id:
                item['identifier'] += '-' + opt + '-' + opt_id[0]
            item['stock'] = 1
            # BUG FIX: the original compared the SelectorList itself to
            # '100' (always False), so out-of-stock options were never
            # flagged. Extract the attribute value first, matching the
            # sibling spider's pattern.
            opt_stock = option.xpath('@data-av').extract()
            if opt_stock and opt_stock[0] == '100':
                item['stock'] = 0
            opt_name = option.xpath('text()').extract()
            if opt_name:
                item['name'] += ' - ' + opt_name[0]
            opt_surcharge = option.xpath('@data-surcharge').extract()
            if opt_surcharge:
                item['price'] += extract_price(opt_surcharge[0])
            if getattr(self, 'simple_run', False) and (hasattr(self, 'matched_identifiers')) \
                    and (item['identifier'] not in self.matched_identifiers):
                continue
            if not item['identifier'] in self.id_seen:
                self.id_seen.append(item['identifier'])
                yield item