def parse_product(self, response):
    """Yield listed products whose manufacturer id matches the request.

    Every anchor whose href contains ``ProductDetail`` is located and its
    grandparent element is treated as one product row. A row is emitted only
    when the text of its first ``nobr`` element equals
    ``response.meta['mfrgid']`` (case-insensitive, surrounding whitespace
    ignored).

    :param response: page to parse; anything that is not an ``HtmlResponse``
        is ignored. ``response.meta`` must provide ``sku`` and ``mfrgid``.
    :return: generator of loaded product items.
    """
    if not isinstance(response, HtmlResponse):
        return

    # Compiled once instead of once per product row.
    detail_link_re = re.compile('ProductDetail')
    price_re = re.compile(r'\$')

    soup = BeautifulSoup(response.body)
    # Several detail links can live in the same row; the set removes
    # duplicate rows.
    rows = {link.parent.parent
            for link in soup.findAll('a', href=detail_link_re)}

    for row in rows:
        # Check the manufacturer id first: a row without a nobr element (or
        # with an empty one) can never match, so it is skipped instead of
        # raising AttributeError.
        mfr_cell = row.find('nobr')
        site_mfrgid = mfr_cell.text if mfr_cell is not None else None
        if not site_mfrgid:
            continue
        wanted_mfrgid = response.meta['mfrgid'].strip().lower()
        if site_mfrgid.strip().lower() != wanted_mfrgid:
            continue

        name = row.findAll('font')[1].text
        price = row.find('nobr', text=price_re)
        link = row.find('a', href=detail_link_re)
        if link:
            url = urljoin_rfc(get_base_url(response), link['href'])
        else:
            url = response.url

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        # The url used to be added twice; once is enough.
        product_loader.add_value('url', url)
        product_loader.add_value('sku', response.meta['sku'])
        yield product_loader.load_item()
def parse_product(self, response):
    """Yield listed products whose manufacturer id matches the request.

    Every anchor whose href contains ``ProductDetail`` is located and its
    grandparent element is treated as one product row. A row is emitted only
    when the text of its first ``nobr`` element equals
    ``response.meta['mfrgid']`` (case-insensitive, surrounding whitespace
    ignored).

    :param response: page to parse; anything that is not an ``HtmlResponse``
        is ignored. ``response.meta`` must provide ``sku`` and ``mfrgid``.
    :return: generator of loaded product items.
    """
    if not isinstance(response, HtmlResponse):
        return

    # Compiled once instead of once per product row.
    detail_link_re = re.compile('ProductDetail')
    price_re = re.compile(r'\$')

    soup = BeautifulSoup(response.body)
    # Several detail links can live in the same row; the set removes
    # duplicate rows.
    rows = {link.parent.parent
            for link in soup.findAll('a', href=detail_link_re)}

    for row in rows:
        # Check the manufacturer id first: a row without a nobr element (or
        # with an empty one) can never match, so it is skipped instead of
        # raising AttributeError.
        mfr_cell = row.find('nobr')
        site_mfrgid = mfr_cell.text if mfr_cell is not None else None
        if not site_mfrgid:
            continue
        wanted_mfrgid = response.meta['mfrgid'].strip().lower()
        if site_mfrgid.strip().lower() != wanted_mfrgid:
            continue

        name = row.findAll('font')[1].text
        price = row.find('nobr', text=price_re)
        link = row.find('a', href=detail_link_re)
        if link:
            url = urljoin_rfc(get_base_url(response), link['href'])
        else:
            url = response.url

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        # The url used to be added twice; once is enough.
        product_loader.add_value('url', url)
        product_loader.add_value('sku', response.meta['sku'])
        yield product_loader.load_item()
def parse(self, response):
    """Follow pagination and product links on a search results page.

    The page is inspected twice, once through BeautifulSoup and once through
    XPath, and each pass yields its own requests; the same link may
    therefore be requested by both passes.

    :param response: results page; ``response.meta`` is forwarded unchanged
        to every request.
    :return: generator of ``Request`` objects. Pagination requests use the
        default callback, product requests use ``self.parse_options``.
    """
    base_url = get_base_url(response)
    soup = BeautifulSoup(response.body)
    hxs = HtmlXPathSelector(response)

    # Pagination, BeautifulSoup flavour.
    next_link = soup.find('a', 'pagnNext')
    if next_link:
        yield Request(urljoin_rfc(base_url, next_link['href']),
                      meta=response.meta)

    # Pagination, XPath flavour.
    next_hrefs = hxs.select('//a[@id="pagnNextLink"]/@href').extract()
    if next_hrefs:
        yield Request(urljoin_rfc(base_url, next_hrefs[0]),
                      meta=response.meta)

    # Product links, BeautifulSoup flavour.
    for result in soup.findAll('div', id=re.compile(u'^result_.*')):
        heading = result.find('h3', 'newaps')
        link = heading.find('a') if heading else None
        if link:
            yield Request(urljoin_rfc(base_url, link['href']),
                          meta=response.meta,
                          callback=self.parse_options)

    # Product links, XPath flavour.
    for result in hxs.select(
            u'//div[@id="atfResults" or @id="btfResults"]'
            u'//div[starts-with(@id, "result_")]'):
        hrefs = result.select(u'.//h3/a/@href').extract()
        if not hrefs:
            # No heading link in this result (was a bare ``except:``).
            continue
        # Joined with the base URL like the BeautifulSoup pass, so a
        # relative href still produces a valid request.
        yield Request(urljoin_rfc(base_url, hrefs[0]),
                      meta=response.meta,
                      callback=self.parse_options)
def parse_items(self, response):
    """Yield product requests from a listing page, re-fetching wrong pages.

    If the page number shown on the page differs from
    ``response.meta['cur']`` and fewer than 5 attempts were made, the same
    URL is requested again (with an ``&at=<attempt>`` suffix and the
    attempt counter incremented) and nothing else is yielded.

    :param response: listing page; ``response.meta`` must provide ``cur``
        and ``attempt`` whenever the page shows a current-page marker.
    :return: generator of ``Request`` objects.
    """
    hxs = HtmlXPathSelector(response)
    cur_page = hxs.select('//span[@class="currentPage"]/text()').extract()
    if (cur_page and int(cur_page[0]) != response.meta['cur']
            and response.meta['attempt'] < 5):
        log.msg('WRONG PAGE! ONE MORE ATTEMPT to ' + response.url)
        yield Request(response.url + '&at=' + str(response.meta['attempt']),
                      meta={
                          'cur': response.meta['cur'],
                          'attempt': response.meta['attempt'] + 1
                      },
                      dont_filter=True,
                      callback=self.parse_items)
        return

    base_url = get_base_url(response)
    soup = BeautifulSoup(response.body)
    # Product links are anchors that contain a <b> child and sit inside a
    # <td colspan=2> ancestor.
    product_links = soup.findAll(
        lambda tag: tag.name == 'a' and tag.findChild('b')
        and tag.findParent('td', {'colspan': 2}))
    for link in product_links:
        yield Request(urljoin_rfc(base_url, link['href']),
                      callback=self.parse_product)

    # NOTE(review): the original ended with an unterminated triple-quoted
    # string holding disabled code. Only its first line is visible here, so
    # it is kept as a comment:
    # trs = hxs.select('//div[@id="mainContent"]//table[@style="height:100%"]/tr')
def parse_brand(self, response):
    """Yield product and pagination requests for a brand listing page.

    Products and pager links are collected twice, once through XPath and
    once through BeautifulSoup. Each product request carries a copy of
    ``response.meta`` extended with a ``promotions`` entry (the ``alt`` text
    of the discount corner image, or ``''`` when there is none).

    :param response: brand listing page.
    :return: generator of ``Request`` objects.
    """
    hxs = HtmlXPathSelector(response)

    # if nothing found try to reload page
    if hxs.select('//div[@class="detailPageTitle"][text()="Viewing 0"]'):
        req = self.retry(response)
        if req:
            yield req
        return

    base_url = get_base_url(response)
    soup = BeautifulSoup(response.body)

    # Products, XPath flavour.
    for thumb in hxs.select('//ul[@class="stockthumbwrapper"]'):
        url = thumb.xpath(
            './/li[@class="productThumbName"]/a/@href')[0].extract()
        promo = thumb.xpath(
            './/li[@class="productThumbImage"]//img[contains(@class,"cornerImgFormat2 discount")]/@alt'
        ).extract()
        meta = response.meta.copy()
        meta['promotions'] = promo[0] if promo else ''
        # Pass the enriched copy; passing response.meta dropped the
        # promotions entry that was just computed.
        yield Request(urljoin(base_url, url),
                      callback=self.parse_product,
                      meta=meta)

    # Products, BeautifulSoup flavour.
    for thumb in soup.findAll('ul', 'stockthumbwrapper'):
        url = thumb.find('li', 'productThumbName').find('a')['href']
        promo = thumb.find('li', 'productThumbImage').find(
            'img', attrs={'class': re.compile('cornerImgFormat2 discount')})
        meta = response.meta.copy()
        meta['promotions'] = promo['alt'] if promo else ''
        yield Request(urljoin(base_url, url),
                      callback=self.parse_product,
                      meta=meta)

    # Pagination, BeautifulSoup flavour. Join the href string, not the tag
    # object itself.
    pagers = soup.findAll('div', id='pagenumber')
    if pagers:
        hrefs = set(a['href'] for a in pagers[0].findAll('a')
                    if a.get('href'))
        for href in hrefs:
            yield Request(response.urljoin(href),
                          meta=response.meta,
                          callback=self.parse_brand)

    # Pagination, XPath flavour.
    for href in set(
            hxs.select('//div[@id="pagenumber"][1]/a/@href').extract()):
        yield Request(response.urljoin(href),
                      meta=response.meta,
                      callback=self.parse_brand)
def parse_product(self, response):
    """Scrape a single product page.

    Pages that still show listing tiles are delegated to
    ``parse_category``; discontinued products yield nothing. Otherwise one
    item is loaded from XPath results, with BeautifulSoup lookups as a
    fallback for name, brand and image, and regex matches on the page text
    for price, sku and identifier.
    """
    soup = BeautifulSoup(response.body)

    # product list page
    if soup.findAll('a', {'class': 'products-list__item'}):
        for request in self.parse_category(response):
            yield request
        return

    # discontinued product: flagged either by markup or by plain text
    is_discontinued = (
        response.xpath("//div[contains(@class, 'discontinued')]")
        or 'Discontinued Product' in response.body)
    if is_discontinued:
        return

    name = response.xpath("//h1[@itemprop='name']/text()").extract() \
        or soup.find('h1', {'itemprop': 'name'}).text

    page_text = response.body_as_unicode()
    price = re.findall(
        '"per_box_price_formated":"<span class=\\\\"price\\\\">\\\\u[\da-f]{4}([\d\.]*)<\\\\/span>",',
        page_text)[0]

    brand = response.xpath('//span[@itemprop="manufacturer"]/text()').re(
        'by (.*)')
    if not brand:
        manufacturer = soup.find('span', {'itemprop': 'manufacturer'})
        brand = manufacturer.text.split('by ')[-1].strip()

    sku = re.search('"sku":"([^"]*)","product_id"', page_text).group(1)
    identifier = re.search('"product_id":"([^"]*)"', page_text).group(1)

    image_url = response.xpath("//img[@class='prod-image']/@src").extract() \
        or soup.find('img', {'itemprop': 'image'})['src']

    # Breadcrumb entries without the first and the last one.
    crumbs = response.xpath("//ul[@class='gl3-breadcrumbs']/li")[1:-1]
    cats = [''.join(crumb.xpath('.//text()').extract()).strip()
            for crumb in crumbs]

    if float(price) < 49:
        shipping_cost = '2.98'
    else:
        shipping_cost = '0'

    loader = ProductLoaderWithNameStrip(Product(), response=response)
    field_values = (
        ('name', name),
        ('price', price),
        ('stock', None),
        ('url', response.url),
        ('brand', brand),
        ('sku', sku),
        ('identifier', identifier),
        ('image_url', image_url),
        ('category', cats),
        ('shipping_cost', shipping_cost),
    )
    for field, value in field_values:
        loader.add_value(field, value)
    yield loader.load_item()
def parse_category(self, response):
    """Request every product tile on a category page.

    When the page has no tiles but its text contains a ``product_id``
    entry, it is parsed by ``parse_product`` instead.
    """
    page = BeautifulSoup(response.body)
    tiles = page.findAll('a', {'class': 'products-list__item'})
    for tile in tiles:
        yield Request(tile['href'], callback=self.parse_product)

    product_id = re.search('"product_id":"([^"]*)"',
                           response.body_as_unicode())
    if tiles or not product_id:
        return

    for item in self.parse_product(response):
        yield item
def parse(self, response):
    """Request every category page linked from the start page.

    Links come from three places: ``data-href`` attributes of menu divs,
    anchors inside menu lists (both via XPath) and anchors with class
    ``link`` (via BeautifulSoup).
    """
    soup = BeautifulSoup(response.body)

    menu_urls = response.xpath(
        '//div[contains(@class, "menu")]/@data-href').extract()
    menu_urls.extend(
        response.xpath('//ul[contains(@class, "menu")]//a/@href').extract())
    for menu_url in menu_urls:
        yield Request(response.urljoin(menu_url),
                      callback=self.parse_category)

    for anchor in soup.findAll('a', {'class': 'link'}):
        yield Request(response.urljoin(anchor['href']),
                      callback=self.parse_category)
def parse_items(self, response):
    """Yield product requests from a listing page, re-fetching wrong pages.

    If the page number shown on the page differs from
    ``response.meta["cur"]`` and fewer than 5 attempts were made, the same
    URL is requested again (with an ``&at=<attempt>`` suffix and the
    attempt counter incremented) and nothing else is yielded.

    :param response: listing page; ``response.meta`` must provide ``cur``
        and ``attempt`` whenever the page shows a current-page marker.
    :return: generator of ``Request`` objects.
    """
    hxs = HtmlXPathSelector(response)
    cur_page = hxs.select('//span[@class="currentPage"]/text()').extract()
    if (cur_page and int(cur_page[0]) != response.meta["cur"]
            and response.meta["attempt"] < 5):
        log.msg("WRONG PAGE! ONE MORE ATTEMPT to " + response.url)
        yield Request(
            response.url + "&at=" + str(response.meta["attempt"]),
            meta={"cur": response.meta["cur"],
                  "attempt": response.meta["attempt"] + 1},
            dont_filter=True,
            callback=self.parse_items,
        )
        return

    base_url = get_base_url(response)
    soup = BeautifulSoup(response.body)
    # Product links are anchors that contain a <b> child and sit inside a
    # <td colspan=2> ancestor.
    product_links = soup.findAll(
        lambda tag: tag.name == "a" and tag.findChild("b")
        and tag.findParent("td", {"colspan": 2})
    )
    for link in product_links:
        yield Request(urljoin_rfc(base_url, link["href"]),
                      callback=self.parse_product)

    # NOTE(review): the original ended with an unterminated triple-quoted
    # string holding disabled code. Only its first line is visible here, so
    # it is kept as a comment:
    # trs = hxs.select('//div[@id="mainContent"]//table[@style="height:100%"]/tr')
def parse_product(self, response):
    """Yield one item per variant listed on a product page.

    Variants come from one of two places:

    * a ``responsive-table`` table, one variant per body row, each with its
      own name, code number and price; or
    * when there is no such table, the ``option`` drop-downs, in which case
      every combination of drop-down choices becomes a variant sharing the
      page-level price.

    If the page has no ``product`` div the page is retried through
    ``_retry_page``; when no retry request is returned an error is logged.

    :param response: product page.
    :return: generator of loaded items (or a single retry request).
    """
    soup = BeautifulSoup(response.body)
    product_div = soup.find('div', attrs={'class': 'product'})
    if not product_div:
        retry_request = _retry_page(response)
        if retry_request:
            yield retry_request
        else:
            self.log(
                "Error parsing page, couldn't extract product name: %s" %
                response.url)
        return

    main_name = remove_entities(product_div.h1.text)

    brand_el = soup.find(
        lambda tag: tag.name == 'td' and 'brand' in tag.text.lower())
    brand = brand_el.findNextSibling('td').text.strip() if brand_el else ''

    breadcrumb_spans = soup.find('div', attrs={
        'class': 'breadcrumbtrail'
    }).span.findAll('span')
    # The first two linked breadcrumb entries are dropped.
    cat_names = [span.a.text for span in breadcrumb_spans if span.a][2:]

    image_el = soup.find('img', {'itemprop': 'image'})
    image_url = image_el['src'] if image_el else None

    def shipping_cost_for(amount):
        # Shipping tiers by price; no shipping cost is recorded from 130 up.
        if amount < 15:
            return 3
        if amount < 40:
            return 4
        if amount < 130:
            return 7
        return None

    def build_item(name, identifier, price):
        # Populate a loader with the page-level data plus variant specifics.
        loader = ProductLoaderWithNameStrip(Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('brand', brand)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('price', price)
        for cat_name in cat_names:
            loader.add_value('category', cat_name)
        loader.add_value('shipping_cost',
                         shipping_cost_for(extract_price(price)))
        loader.add_value('image_url', image_url)
        return loader.load_item()

    table = soup.find('table', id='responsive-table')
    options = soup.findAll('div', attrs={'class': 'option'})

    if table:
        for row in table.findAll('tr'):
            # Skip head row
            if not row.td:
                continue

            name = remove_entities(
                row.find('span', attrs={'class': 'name'}).text)
            if not _main_name_in_opt_name(main_name, name):
                name = main_name + ' ' + name

            code_el = row.find('span', attrs={'class': 'codenumber'})
            if not code_el:
                self.errors.append(
                    "Identifier not found for products on page: %s" %
                    response.url)
                continue

            price = row.find(_is_price_tag).text
            yield build_item(name, code_el.text, price)

    elif options:
        main_id = response.url.split('.')[-2].split('p-')[-1]
        price = soup.find('span', attrs={'class': 'inctax'}).span.text

        # Map each option label to its selectable choices.
        results = {}
        for opt in options:
            opt_name = opt.label.span.text
            results[opt_name] = []
            for subopt in opt.select.findAll('option'):
                # A value of '0' is skipped (presumably the placeholder
                # entry -- TODO confirm).
                if _soup_el_get_attr(subopt, 'value') == '0':
                    continue
                subopt_name = subopt.text
                results[opt_name].append({
                    'id': remove_entities(subopt_name).replace('"', ''),
                    'name': opt_name + ': ' + subopt_name
                })

        # ``product`` is a name from the enclosing file, presumably
        # itertools.product: one item per combination of choices.
        for opt_tuple in product(*results.values()):
            yield build_item(_build_opt_name(main_name, opt_tuple),
                             _build_opt_id(main_id, opt_tuple),
                             price)
def parse(self, response):
    """Request every product tile on the page, forwarding ``response.meta``."""
    # BeautifulSoup is used because the HTML is broken and lxml cannot
    # parse it.
    listing = BeautifulSoup(response.body)
    for tile in listing.findAll('a', {'class': 'products-list__item'}):
        yield Request(tile['href'],
                      meta=response.meta,
                      callback=self.parse_product)
def parse_product(self, response):
    """Yield one product per price option found on a results page.

    Each ``div.itemResultsRow`` is one listed product. Rows without a
    ``mv_order_item`` select element expose their options as
    ``td.priceCell`` cells; rows with that select element encode price and
    size in each option's text as ``<price> (<size>)``.

    :param response: page to parse; non-``HtmlResponse`` objects are
        ignored.
    :return: generator of loaded product items; options whose loader ends
        up without a price are not yielded.
    """
    if not isinstance(response, HtmlResponse):
        return

    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    base_url = get_base_url(response)
    # "<price> (<size>)" as found in the drop-down option texts.
    option_re = re.compile(r'(.*?) \((.*)\)')

    def optional_span_text(container, css_class):
        # Stripped text of an optional title span, u'' when it is missing.
        try:
            return container.find(
                'span', attrs={'class': css_class}).text.strip()
        except AttributeError:
            return u''

    try:
        rows = soup.findAll('div', attrs={'class': 'itemResultsRow'})
    except AttributeError:
        rows = []

    for row in rows:
        # The title link is looked up once per row and reused below.
        title_link = row.find('div', attrs={'class': 'itemTitle'}).find('a')
        url = urljoin_rfc(base_url, title_link['href'])

        # Brand is optional in both branches; the title is mandatory.
        brand = optional_span_text(title_link, 'brand')
        title = title_link.find(
            'span', attrs={'class': 'title'}).text.strip()

        dropdown = row.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            vintage_age = optional_span_text(title_link, 'vintageAge')
            for cell in row.findAll('td', attrs={'class': 'priceCell'}):
                name = u'%s %s %s' % (brand, title, vintage_age)
                loader = ProductLoader(item=Product(), selector=cell)
                loader.add_value('url', url)

                # Prefer the sale price, fall back to the retail price.
                try:
                    price = cell.find(
                        'p', attrs={'class': 'priceCellP salePriceP'}
                    ).find('span', attrs={'class': 'priceSale'}).text.strip()
                except AttributeError:
                    price = cell.find(
                        'p', attrs={'class': 'priceCellP'}
                    ).find('span', attrs={'class': 'priceRetail'}).text.strip()

                bottle_size = cell.find(
                    'p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    bottle_size = cell.find(
                        lambda tag: tag.name == 'span'
                        and tag.get('class', '') == 'priceUnit'
                        and tag.findParent(
                            'p', attrs={'class': 'priceCellP'}))
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()

                loader.add_value('name', name)
                loader.add_value('price', price)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            for option in dropdown.findAll('option'):
                match = option_re.search(option.text)
                if match is None:
                    # Text without "price (size)" -- presumably a
                    # placeholder entry; skip it instead of aborting the
                    # whole page with an AttributeError.
                    continue
                price, size = match.groups()
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', url)
                loader.add_value('name', u'%s %s %s' % (brand, title, size))
                loader.add_value('price', price)
                if loader.get_output_value('price'):
                    yield loader.load_item()
def parse_product(self, response):
    """Yield one product per price option found on a results page.

    Each ``div.itemResultsRow`` is one listed product. Rows without a
    ``mv_order_item`` select element expose their options as
    ``td.priceCell`` cells; rows with that select element encode price and
    size in each option's text as ``<price> (<size>)``.

    :param response: page to parse; non-``HtmlResponse`` objects are
        ignored.
    :return: generator of loaded product items; options whose loader ends
        up without a price are not yielded.
    """
    if not isinstance(response, HtmlResponse):
        return

    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    base_url = get_base_url(response)
    # "<price> (<size>)" as found in the drop-down option texts.
    option_re = re.compile(r'(.*?) \((.*)\)')

    def optional_span_text(container, css_class):
        # Stripped text of an optional title span, u'' when it is missing.
        try:
            return container.find(
                'span', attrs={'class': css_class}).text.strip()
        except AttributeError:
            return u''

    try:
        rows = soup.findAll('div', attrs={'class': 'itemResultsRow'})
    except AttributeError:
        rows = []

    for row in rows:
        # The title link is looked up once per row and reused below.
        title_link = row.find('div', attrs={'class': 'itemTitle'}).find('a')
        url = urljoin_rfc(base_url, title_link['href'])

        # Brand is optional in both branches; the title is mandatory.
        brand = optional_span_text(title_link, 'brand')
        title = title_link.find(
            'span', attrs={'class': 'title'}).text.strip()

        dropdown = row.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            vintage_age = optional_span_text(title_link, 'vintageAge')
            for cell in row.findAll('td', attrs={'class': 'priceCell'}):
                name = u'%s %s %s' % (brand, title, vintage_age)
                loader = ProductLoader(item=Product(), selector=cell)
                loader.add_value('url', url)

                # Prefer the sale price, fall back to the retail price.
                try:
                    price = cell.find(
                        'p', attrs={'class': 'priceCellP salePriceP'}
                    ).find('span', attrs={'class': 'priceSale'}).text.strip()
                except AttributeError:
                    price = cell.find(
                        'p', attrs={'class': 'priceCellP'}
                    ).find('span', attrs={'class': 'priceRetail'}).text.strip()

                bottle_size = cell.find(
                    'p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    bottle_size = cell.find(
                        lambda tag: tag.name == 'span'
                        and tag.get('class', '') == 'priceUnit'
                        and tag.findParent(
                            'p', attrs={'class': 'priceCellP'}))
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()

                loader.add_value('name', name)
                loader.add_value('price', price)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            for option in dropdown.findAll('option'):
                match = option_re.search(option.text)
                if match is None:
                    # Text without "price (size)" -- presumably a
                    # placeholder entry; skip it instead of aborting the
                    # whole page with an AttributeError.
                    continue
                price, size = match.groups()
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', url)
                loader.add_value('name', u'%s %s %s' % (brand, title, size))
                loader.add_value('price', price)
                if loader.get_output_value('price'):
                    yield loader.load_item()
def parse_product(self, response):
    """Yield one product per price option, from a listing or a single page.

    When the page contains ``div.itemResultsRow`` elements, each of them is
    one listed product. Otherwise the whole page is treated as a single
    product whose URL is ``response.url``.

    In both modes the options come either from ``td.priceCell`` cells or,
    when a ``mv_order_item`` select element is present, from its option
    texts, which are expected to look like ``$<price> (<size>) <sku>``.

    :param response: page to parse; non-``HtmlResponse`` objects are
        ignored.
    :return: generator of loaded product items; options whose loader ends
        up without a price are not yielded.
    """
    if not isinstance(response, HtmlResponse):
        return

    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    option_re = re.compile(r'(\$[\d\.]*) \(([^)]*)\) (.*)$')

    def optional_span_text(container, css_class):
        # Stripped text of an optional title span, u'' when it is missing
        # (also when the container itself is missing).
        try:
            return container.find(
                'span', attrs={'class': css_class}).text.strip()
        except AttributeError:
            return u''

    def items_from_price_cells(cells, url, base_name):
        # One item per price cell: sale price preferred over retail price.
        for cell in cells:
            loader = ProductLoader(item=Product(), selector=cell)
            loader.add_value('url', url)
            try:
                price = cell.find(
                    'p', attrs={'class': 'priceCellP salePriceP'}
                ).find('span', attrs={'class': 'priceSale'}).text.strip()
            except AttributeError:
                price = cell.find(
                    'p', attrs={'class': 'priceCellP'}
                ).find('span', attrs={'class': 'priceRetail'}).text.strip()
            try:
                sku = cell.find(
                    'p', attrs={'class': 'priceCellP itemid'}).text.strip()
            except AttributeError:
                sku = ''

            bottle_size = cell.find(
                'p', attrs={'class': 'priceCellP priceUnit'})
            if not bottle_size:
                bottle_size = cell.find(
                    lambda tag: tag.name == 'span'
                    and tag.get('class', '') == 'priceUnit'
                    and tag.findParent('p', attrs={'class': 'priceCellP'}))
            name = base_name
            if bottle_size:
                name += u' %s' % bottle_size.text.strip()

            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('sku', sku)
            if loader.get_output_value('price'):
                yield loader.load_item()

    def items_from_dropdown(dropdown, url, base_name, strip_size):
        # One item per drop-down option matching "$price (size) sku".
        for option in dropdown.findAll('option'):
            match = option_re.search(option.text)
            if match is None:
                # Presumably a placeholder entry; skip it instead of
                # aborting the whole page with an AttributeError.
                continue
            price, size, sku = match.groups()
            if strip_size:
                size = size.strip()
            # Built from the response: the single-product branch used to
            # pass the regex groups tuple as ``selector``.
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', url)
            loader.add_value('name', u'%s %s' % (base_name, size))
            loader.add_value('price', price)
            loader.add_value('sku', sku)
            if loader.get_output_value('price'):
                yield loader.load_item()

    try:
        rows = soup.findAll('div', attrs={'class': 'itemResultsRow'})
    except AttributeError:
        rows = []

    for row in rows:
        title_link = row.find('div', attrs={'class': 'itemTitle'}).find('a')
        url = urljoin_rfc(get_base_url(response), title_link['href'])
        brand = optional_span_text(title_link, 'brand')
        title = title_link.find(
            'span', attrs={'class': 'title'}).text.strip()
        vintage_age = optional_span_text(title_link, 'vintageAge')

        dropdown = row.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            cells = row.findAll('td', attrs={'class': 'priceCell'})
            found = items_from_price_cells(
                cells, url, u'%s %s %s' % (brand, title, vintage_age))
        else:
            # Listing drop-down names carry no vintage/age and use the size
            # text as-is (unchanged from the original naming).
            found = items_from_dropdown(
                dropdown, url, u'%s %s' % (brand, title), False)
        for item in found:
            yield item

    if not rows:
        # Single-product page: the title spans sit directly in the title
        # div.
        title_div = soup.find('div', attrs={'class': 'itemTitle'})
        brand = optional_span_text(title_div, 'brand')
        title = title_div.find('span', attrs={'class': 'title'}).text.strip()
        vintage_age = optional_span_text(title_div, 'vintageAge')
        base_name = u'%s %s %s' % (brand, title, vintage_age)

        dropdown = soup.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            cells = soup.find('div', attrs={
                'class': 'priceArea'
            }).findAll('td', attrs={'class': 'priceCell'})
            found = items_from_price_cells(cells, response.url, base_name)
        else:
            found = items_from_dropdown(
                dropdown, response.url, base_name, True)
        for item in found:
            yield item
def parse_product(self, response):
    """Scrape a product page via XPath, falling back to BeautifulSoup.

    The item is first loaded through XPath selectors. If that leaves the
    name empty, the page is parsed again with BeautifulSoup. The resulting
    item is yielded when it has an identifier; if the name is still missing
    afterwards, a retry is requested through ``self.retry``.

    :param response: product page.
    :return: generator yielding the item and/or a retry request.
    """
    hxs = HtmlXPathSelector(text=response.body_as_unicode())

    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', response.url)
    identifier = hxs.select('//input[@id="catentryId"]/@value').extract()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    price = ''.join(
        hxs.select('//div[@itemprop="price"]//span[@class="price"]//text()'
                   ).extract()).strip()
    loader.add_value('price', price)
    categories = hxs.select(
        '//ul[@class="breadcrumbs"]//li[not(@class="home")]/a/span/text()'
    ).extract()[1:]
    loader.add_value('category', categories)
    image_url = hxs.select('//img[@id="productMainImage"]/@src').extract()
    if image_url:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image_url[0]))
    brand = hxs.select(
        '//li[contains(text(), "BRAND")]/span/text()').extract()
    loader.add_value('brand', brand)
    item = loader.load_item()

    if not item.get('name'):
        log.msg('Using BeautifulSoup: ' + response.url)
        soup = BeautifulSoup(response.body)
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', response.url)

        # Every lookup below tolerates a missing element, so an incomplete
        # page reaches the retry at the bottom instead of raising
        # AttributeError.
        id_input = soup.find('input', attrs={'id': 'catentryId'})
        if id_input:
            identifier = _soup_el_get_attr(id_input, 'value')
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)

        name_el = soup.find('h1', attrs={'itemprop': 'name'})
        if name_el:
            loader.add_value('name', name_el.text)

        breadcrumbs = soup.find('ul', attrs={'class': 'breadcrumbs'})
        if breadcrumbs:
            categories = [
                li.a.span.text for li in breadcrumbs.findAll('li') if li.a
            ][2:]
            loader.add_value('category', categories)

        price_box = soup.find('div', attrs={'itemprop': 'price'})
        price_el = None
        if price_box:
            price_el = price_box.find('span', attrs={'class': 'price'})
        if price_el:
            loader.add_value('price', price_el.text)

        image_el = soup.find('img', attrs={'id': 'productMainImage'})
        if image_el:
            loader.add_value(
                'image_url',
                urljoin_rfc(get_base_url(response),
                            _soup_el_get_attr(image_el, 'src')))

        brand = ''
        for li in soup.findAll('li'):
            if 'BRAND' in li.text.upper() and li.span:
                brand = li.span.text
                break
        loader.add_value('brand', brand)
        item = loader.load_item()

    # ``.get`` because subscripting an item whose identifier was never set
    # raises KeyError.
    if item.get('identifier'):
        yield item

    if not item.get('name'):
        request = self.retry(response,
                             "No name for product: " + response.url)
        if request:
            yield request
def parse_date(self, response):
    """Yield ticket products for one date from a JSON price response.

    The response body is JSON with an ``error`` flag and an ``html``
    fragment. Every ``td.table_desc`` cell in the fragment describes one
    ticket type; its price is read from the third ``td`` of the same row.

    The first cell mentioning "adult" and the first mentioning "child" or
    "junior" become the main Adult and Child items. Every other cell
    becomes an extra option, unless its text contains an excluded term
    (concession, student, infant, niño); extra options are yielded only
    when the loader ends up with a price.

    :param response: JSON response; ``response.meta`` must provide
        ``product_id``, ``date``, ``url``, ``location`` and ``name``
        whenever an item is built.
    :return: generator of loaded product items (empty when the response
        reports an error or the HTML cannot be parsed).
    """
    res = json.loads(response.body)
    if res['error']:
        return
    try:
        soup = BeautifulSoup(res['html'])
    except Exception:
        # Best effort: nothing can be extracted without a parsed fragment.
        return

    child_ids = ('children', 'child', 'junior')
    # Unicode literals instead of str.decode('utf8'), which does not exist
    # on Python 3. u'ni\xf1o' is "niño".
    excluded_ids = (u'concession', u'student', u'infant', u'ni\xf1o')

    def price_of(desc_cell):
        # The price sits in the third cell of the description's row.
        return desc_cell.parent.findAll('td')[2].text

    def build_loader(id_suffix, ticket_type, price, name):
        # Populate a loader with the fields shared by every ticket item.
        meta = response.meta
        loader = ProductLoader(item=Product(), selector=HtmlXPathSelector())
        loader.add_value(
            'identifier',
            meta['product_id'] + ':' + meta['date'] + ':' + id_suffix)
        loader.add_value('url', meta['url'])
        loader.add_value('sku', meta['date'])
        loader.add_value('category', meta['location'])
        loader.add_value('brand', ticket_type)
        loader.add_value('price', price)
        loader.add_value('name', name)
        return loader

    adult_price = None
    child_price = None
    remaining_cells = []
    for cell in soup.findAll('td', {'class': 'table_desc'}):
        label = cell.text.lower()
        if not adult_price and 'adult' in label:
            adult_price = price_of(cell)
        elif not child_price and ('child' in label or 'junior' in label):
            child_price = price_of(cell)
        else:
            remaining_cells.append(cell)

    if adult_price:
        yield build_loader('Adult', 'Adult', adult_price,
                           response.meta['name']).load_item()
    if child_price:
        yield build_loader('Child', 'Child', child_price,
                           response.meta['name']).load_item()

    for cell in remaining_cells:
        label = cell.text.lower()
        if any(term in label for term in excluded_ids):
            continue
        is_child = any(term in label for term in child_ids)
        ticket_type = 'Child' if is_child else 'Adult'
        loader = build_loader(ticket_type + ':' + label,
                              ticket_type,
                              price_of(cell),
                              response.meta['name'] + ' - ' + cell.text)
        if loader.get_output_value('price'):
            yield loader.load_item()