def parse_product(self, response):
    """Parse a product listing page and yield matching Product items.

    Each listed product's manufacturer id (the first ``<nobr>`` text)
    is compared against ``response.meta['mfrgid']``; only products
    whose ids agree are yielded.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body)
    # Product rows are the grandparents of the ProductDetail links;
    # a set collapses rows that contain several such links.
    products = soup.findAll('a', href=re.compile('ProductDetail'))
    products = {product.parent.parent for product in products}
    for product in products:
        product_loader = ProductLoader(item=Product(), response=response)
        name = product.findAll('font')[1].text
        price = product.find('nobr', text=re.compile(r'\$'))
        url = product.find('a', href=re.compile('ProductDetail'))
        if url:
            url = urljoin_rfc(get_base_url(response), url['href'])
        else:
            url = response.url
        product_loader.add_value('name', name)
        product_loader.add_value('price', price)
        # Fixed: 'url' was added twice, duplicating the value in the item.
        product_loader.add_value('url', url)
        product_loader.add_value('sku', response.meta['sku'])
        site_mfrgid = product.find('nobr').text
        if site_mfrgid:
            site_mfrgid = site_mfrgid.strip().lower()
            mfrgid = response.meta['mfrgid'].strip().lower()
            if site_mfrgid == mfrgid:
                yield product_loader.load_item()
def collect_price(self, hxs, response):
    """Extract the product price from the buy form.

    Tries BeautifulSoup first; if that raises (e.g. the ``handleBuy``
    form is missing, making ``soup_form`` None), falls back to a chain
    of XPath selectors. Returns the extracted price or None.
    """
    soup = BeautifulSoup(response.body)
    try:
        soup_form = soup.find(id='handleBuy')
        price = soup_form.find('b', 'priceLarge')
        if not price:
            price = soup_form.find('span', 'priceLarge')
        if not price:
            price = soup_form.find('span', 'price')
        if not price:
            price = soup_form.find('span', 'pa_price')
        if price:
            price = self.extract_price(price.text)
        else:
            price = None
    except Exception:
        # Fixed: bare ``except:`` also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        price = hxs.select('//div[@id="price"]//td[text()="Price:"]'
                           '/following-sibling::td/span/text()').extract()
        if not price:
            price = hxs.select(
                '//span[@id="priceblock_saleprice"]/text()').extract()
        if not price:
            price = hxs.select(
                '//span[@id="priceblock_ourprice"]/text()').extract()
        if not price:
            price = hxs.select(
                '//span[@id="actualPriceValue"]/*[@class="priceLarge"]/text()'
            ).extract()
        if price:
            price = self.extract_price(price[0])
        else:
            price = None
    return price
def parse(self, response):
    """Parse a search results page.

    Queues the first result for ``parse_product`` and carries the rest
    in ``meta['next_results']``; when no products are found, advances
    to the next search URL in ``meta['search_urls']``.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body)
    container = soup.find('div', id='atfResults')
    # Fixed: an 'atfResults' div with no matching result rows used to
    # raise IndexError on meta['next_results'][0]; an empty list now
    # takes the no-products branch instead.
    products = (container.findAll('div', id=re.compile(r'result_\d+$'))
                if container else [])
    if products:
        meta = response.meta
        meta['next_results'] = []
        # Pagination deliberately disabled
        # (was: //a[@class="pagnNext"]/@href).
        next_page = []
        if next_page:
            next_page = urljoin_rfc(get_base_url(response), next_page[0])
            meta['next_page'] = next_page
        for product in products:
            url = product.find('a')['href']
            url = urljoin_rfc(get_base_url(response), url)
            meta['next_results'].append(url)
        first_url = meta['next_results'][0]
        meta['next_results'] = meta['next_results'][1:]
        yield Request(first_url, callback=self.parse_product, meta=meta,
                      dont_filter=True)
    else:
        log.msg('No products.')
        meta = response.meta
        if meta.get('search_urls'):
            search_url = meta['search_urls'][0]
            meta['search_urls'] = meta['search_urls'][1:]
            yield Request(search_url % {'q': meta['sku']}, meta=meta)
def parse(self, response):
    """Queue a parse_brand request for every brand link.

    The XPath pass runs first; the BeautifulSoup pass repeats the
    extraction for markup lxml could not handle (Scrapy's duplicate
    filter collapses any overlap).
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body)
    for anchor in hxs.select("//section[@id='All']//ul[@class='BrandCol1']//a"):
        href = anchor.select("./@href").extract()[0].replace('//', '/')
        link = urljoin_rfc(base_url, href)
        log.msg(link)
        name = anchor.select("./text()").extract()[0]
        yield Request(url=link, meta={'brand': name},
                      callback=self.parse_brand, priority=10)
    section = soup.find('section', id='All')
    if section:
        for tag in section.findAll('a'):
            yield Request(urljoin_rfc(base_url, tag['href'].replace('//', '/')),
                          meta={'brand': tag.text},
                          callback=self.parse_brand, priority=10)
def parse(self, response):
    """Scan the search results and yield only the cheapest product
    whose price passes ``valid_price``."""
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    best = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        soup = BeautifulSoup(result.extract())
        heading = soup.find('h3', attrs={'class': 'newaps'})
        loader.add_value('name', heading.findAll('span')[0].string)
        loader.add_value('url', heading.findAll('a')[0]['href'])
        loader.add_value('price',
                         soup.find('ul', attrs={'class': 'rsltL'})
                             .findAll('span')[0].string)
        price = loader.get_output_value('price')
        if not price:
            continue
        if best is not None and best.get_output_value('price') <= price:
            continue
        if valid_price(response.meta['price'], price):
            best = loader
    if best:
        yield best.load_item()
def parse_product(self, response):
    """Parse the product listing table.

    Uses XPath when lxml parsed the table (20+ rows), otherwise falls
    back to BeautifulSoup for the broken markup.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//table[@class="productListing"]/tr')[1:]
    if len(products) < 20:
        # lxml failed to parse the listing; fall back to BeautifulSoup.
        soup = BeautifulSoup(response.body)
        products = soup.find('table',
                             {'class': 'productListing'}).findAll('tr')[1:]
        for product in products:
            cells = product.findAll('td')
            name = cells[1].find('a').contents
            url = cells[1].find('a')['href']
            price = re.findall(r'[0-9.]+', cells[2].text)
            if not price:
                # Fixed: rows without a numeric price used to raise
                # IndexError on price[0].
                continue
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', name)
            product_loader.add_value('url', url)
            product_loader.add_value('price', price[0])
            yield product_loader.load_item()
    else:
        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_xpath('name', './td[position()=2]/a/text()')
            product_loader.add_xpath('url', './td[position()=2]/a/@href')
            product_loader.add_xpath('price', './td[position()=3]',
                                     re='\xa3(.*[0-9])')
            yield product_loader.load_item()
def parse_product(self, response):
    """Parse the product listing table, skipping zero-priced rows.

    Uses XPath when lxml parsed the table (20+ rows), otherwise falls
    back to BeautifulSoup for the broken markup.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//table[@class="productListing"]/tr')[1:]
    if len(products) < 20:
        # lxml failed to parse the listing; fall back to BeautifulSoup.
        soup = BeautifulSoup(response.body)
        products = soup.find('table',
                             {'class': 'productListing'}).findAll('tr')[1:]
        for product in products:
            cells = product.findAll('td')
            name = cells[1].find('a').contents
            url = cells[1].find('a')['href']
            price = re.findall(r'[0-9.]+', cells[2].text)
            # Fixed: ``price[0] > 0`` compared str to int — always True
            # on Python 2 (TypeError on Python 3), so the zero-price
            # filter never worked; also guard against rows with no
            # numeric price at all.
            if price and float(price[0]) > 0:
                product_loader = ProductLoader(item=Product(),
                                               response=response)
                product_loader.add_value('name', name)
                product_loader.add_value('url', url)
                product_loader.add_value('price', price[0])
                yield product_loader.load_item()
    else:
        for product in products:
            price = product.select('./td[position()=3]').re('\xa3(.*[0-9])')
            # Same str-vs-int comparison fix as above.
            if price and float(price[0]) > 0:
                product_loader = ProductLoader(item=Product(),
                                               selector=product)
                product_loader.add_xpath('name', './td[position()=2]/a/text()')
                product_loader.add_xpath('url', './td[position()=2]/a/@href')
                product_loader.add_xpath('price', './td[position()=3]',
                                         re='\xa3(.*[0-9])')
                yield product_loader.load_item()
def parse(self, response):
    """Follow the "Next" pagination link (XPath first, BeautifulSoup
    fallback) and delegate product extraction to parse_product."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    # Pagination: prefer the XPath match, fall back to a soup search.
    found = hxs.select(
        u'//div[@class="pager"]/a[contains(text(),"Next")]/@href').extract()
    if found:
        target = urljoin_rfc(get_base_url(response), found[0])
        if '127.0.0.1' in target:
            target = target.replace('127.0.0.1', 'argonautliquor.com')
        yield Request(target, dont_filter=True)
    else:
        anchor = soup.find(lambda tag: tag.name == 'a' and 'Next' in tag.text
                           and tag.findParent('div', 'pager'))
        if anchor:
            target = urljoin_rfc(get_base_url(response), anchor['href'])
            if '127.0.0.1' in target:
                target = target.replace('127.0.0.1', 'argonautliquor.com')
            yield Request(target, dont_filter=True)
    # Products on the current page.
    for product in self.parse_product(response):
        yield product
def parse_items(self, response):
    """Follow product links on a listing page.

    When the server returns a page whose number differs from
    ``meta['cur']``, retries the same URL up to 5 times before giving
    up and parsing whatever was returned.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    cur_page = hxs.select('//span[@class="currentPage"]/text()').extract()
    if cur_page and (int(cur_page[0]) != response.meta['cur']) and (
            response.meta['attempt'] < 5):
        log.msg('WRONG PAGE! ONE MORE ATTEMPT to ' + response.url)
        yield Request(response.url + '&at=' + str(response.meta['attempt']),
                      meta={
                          'cur': response.meta['cur'],
                          'attempt': response.meta['attempt'] + 1
                      },
                      dont_filter=True,
                      callback=self.parse_items)
        return
    soup = BeautifulSoup(response.body)
    # Product links are bold anchors inside two-column table cells.
    products = [
        a['href'] for a in soup.findAll(
            lambda tag: tag.name == 'a' and tag.findChild('b') and
            tag.findParent('td', {'colspan': 2}))
    ]
    for url in products:
        url = urljoin_rfc(get_base_url(response), url)
        yield Request(url, callback=self.parse_product)
    # Fixed: removed a dangling, unterminated triple-quoted fragment of
    # commented-out XPath code ('"""trs = hxs.select(...') that left an
    # unclosed string literal at the end of the function.
def parse_product(self, response):
    """Build a Product item from a product detail page, with soup
    fallbacks for fields the XPath selectors miss."""
    if not isinstance(response, HtmlResponse):
        return
    selector = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=selector)
    loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
    if not loader.get_output_value(u'image_url'):
        # The image container sometimes defeats lxml; retry with soup.
        soup = BeautifulSoup(response.body)
        img = soup.find(lambda tag: tag.name == u'img' and
                        tag.findParent(u'tr', id=u'prodImageContainer'))
        if img:
            loader.add_value('image_url', img.get(u'src'))
    loader.add_xpath('brand', u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()')
    # Price: try the bold sale price first, then the plainer variants.
    loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="price"]/text()')
    loader.add_value('sku', response.meta['sku'])
    loader.add_value('identifier', response.meta['sku'].lower())
    yield loader.load_item()
def parse(self, response):
    """Paginate search results and queue every result for parse_options.

    Both the soup-based and the XPath-based extraction run; Scrapy's
    duplicate filter collapses any overlap between the two passes.
    """
    soup = BeautifulSoup(response.body)
    next_page = soup.find('a', 'pagnNext')
    if next_page:
        next_page = urljoin_rfc(get_base_url(response), next_page['href'])
        yield Request(next_page, meta=response.meta)
    hxs = HtmlXPathSelector(response)
    next_page = hxs.select('//a[@id="pagnNextLink"]/@href').extract()
    if next_page:
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                      meta=response.meta)
    for product in soup.findAll('div', id=re.compile(u'^result_.*')):
        heading = product.find('h3', 'newaps')
        url = heading.find('a') if heading else ''
        if url:
            url = urljoin_rfc(get_base_url(response), url['href'])
            yield Request(url, meta=response.meta,
                          callback=self.parse_options)
    for result in hxs.select(
            u'//div[@id="atfResults" or @id="btfResults"]//div[starts-with(@id, "result_")]'
    ):
        urls = result.select(u'.//h3/a/@href').extract()
        if not urls:
            # Fixed: replaced a bare ``except:`` that guarded
            # ``extract()[0]`` with an explicit emptiness check.
            continue
        yield Request(urls[0], meta=response.meta,
                      callback=self.parse_options)
def parse_brands(self, response):
    """Queue a request for every producer ("brand") link found in the
    page's #main cell."""
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    main_cell = soup.find('td', id='main')
    for anchor in main_cell.findAll('a'):
        href = anchor['href']
        if 'producers' not in href:
            continue
        url = urljoin_rfc(get_base_url(response), href)
        # Some links point at the crawler host; rewrite to the live site.
        if '127.0.0.1' in url:
            url = url.replace('127.0.0.1', 'argonautliquor.com')
        yield Request(url, dont_filter=True)
def parse_category(self, response):
    """Follow product links on a category page; if the page has none
    but embeds a product_id, treat it as a product page itself."""
    soup = BeautifulSoup(response.body)
    links = soup.findAll('a', {'class': 'products-list__item'})
    for link in links:
        yield Request(link['href'], callback=self.parse_product)
    has_product_id = re.search('"product_id":"([^"]*)"',
                               response.body_as_unicode())
    if has_product_id and not links:
        for item in self.parse_product(response):
            yield item
def parse(self, response):
    """Open the first search result with parse_product."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body)
    container = soup.find('div', id='atfResults')
    if container:
        products = container.findAll('div', id=re.compile(r'result_\d+$'))
        if not products:
            # Fixed: an 'atfResults' div with no matching result rows
            # used to raise IndexError on products[0].
            return
        url = urljoin_rfc(get_base_url(response),
                          products[0].find('a')['href'])
        yield Request(url, meta=response.meta, callback=self.parse_product)
def parse_review(self, response):
    """Collect reviews into product['metadata']['reviews'].

    Follows "Next" pagination until exhausted, then yields the product
    carried in ``response.meta``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    soup = BeautifulSoup(response.body)
    product = response.meta['product']
    reviews = hxs.select(
        u'//table[@id="productReviews"]//div[@style="margin-left:0.5em;"]')
    if not reviews:
        yield product
        return
    for review in reviews:
        loader = ReviewLoader(item=Review(), selector=hxs,
                              date_format=u'%d/%m/%Y')
        date = review.select(u'.//nobr/text()')[0].extract()
        res = None
        # The site renders dates in several locales; try each format.
        for fmt in (u'%B %d, %Y', u'%d %b %Y', u'%d %B %Y'):
            try:
                res = time.strptime(date, fmt)
            except ValueError:
                pass
            if res:
                break
        if res:
            # Fixed: an unparseable date used to crash in
            # time.strftime(fmt, None); the date field is now simply
            # omitted for such reviews.
            date = time.strftime(u'%d/%m/%Y', res)
            loader.add_value('date', date)
        rating = review.select(u'.//text()').re(
            u'([\d\.]+) out of 5 stars')[0]
        loader.add_value('rating', int(float(rating)))
        loader.add_value('url', response.url)
        title = review.select(u'.//b/text()')[0].extract()
        text = ''.join([
            s.strip() for s in review.select(
                u'div[@class="reviewText"]/text()').extract()
        ])
        loader.add_value('full_text', u'%s\n%s' % (title, text))
        product['metadata']['reviews'].append(loader.load_item())
    next_page = soup.find('a', text=re.compile('Next'))
    if next_page and next_page.parent.get('href'):
        yield Request(urljoin_rfc(base_url, next_page.parent['href']),
                      meta=response.meta,
                      callback=self.parse_review)
    else:
        yield product
def parse_product(self, response):
    """Build a Product item from a detail page.

    Shipping costs 9.98 for orders priced at 59 or below, 5.98 above.
    """
    loader = ProductLoader(item=Product(), response=response)
    soup = BeautifulSoup(response.body)
    try:
        price = soup.find('span', {'class': 'price ours'}).text
    except AttributeError:
        self.log('price not found {}'.format(response.url))
        return
    image_url = soup.find('img', itemprop='image')['src']
    form = soup.find('form', id='product_addtocart_form')
    # The product id is the path segment right after 'product/'.
    identifier = form['action'].split('product/')[-1].split('/')[0]
    loader.add_value('image_url', image_url)
    loader.add_value('price', price)
    loader.add_value('name', soup.find('h1', itemprop='name').text.strip())
    loader.add_value('category', response.meta.get('category', ''))
    brand = soup.find('span', itemprop='manufacturer').text.replace(
        ' ', '').split('by', 1)[1].strip()
    loader.add_value('brand', brand)
    loader.add_value('url', response.url)
    sku = soup.find('input', id='eye')
    loader.add_value('identifier', identifier)
    if sku:
        loader.add_value('sku', sku['value'])
    if loader.get_output_value('price') <= Decimal(59):
        shipping_cost = '9.98'
    else:
        shipping_cost = '5.98'
    loader.add_value('shipping_cost', shipping_cost)
    yield loader.load_item()
def parse(self, response):
    """Queue category pages found via XPath and via BeautifulSoup."""
    soup = BeautifulSoup(response.body)
    hrefs = response.xpath(
        '//div[contains(@class, "menu")]/@data-href').extract()
    hrefs += response.xpath(
        '//ul[contains(@class, "menu")]//a/@href').extract()
    for href in hrefs:
        yield Request(response.urljoin(href), callback=self.parse_category)
    # Soup pass for links lxml may have missed in broken markup.
    for anchor in soup.findAll('a', {'class': 'link'}):
        yield Request(response.urljoin(anchor['href']),
                      callback=self.parse_category)
def parse_brand(self, response):
    """Parse a brand listing page.

    Queues product pages (carrying any promotion text in meta) from
    both an XPath pass and a soup pass, then follows the numbered
    pagination links.
    """
    hxs = HtmlXPathSelector(response)
    # If nothing was found, try to reload the page.
    if hxs.select('//div[@class="detailPageTitle"][text()="Viewing 0"]'):
        req = self.retry(response)
        if req:
            yield req
        return
    soup = BeautifulSoup(response.body)
    for p in hxs.select('//ul[@class="stockthumbwrapper"]'):
        url = p.xpath(
            './/li[@class="productThumbName"]/a/@href')[0].extract()
        meta = response.meta.copy()
        promo = p.xpath(
            './/li[@class="productThumbImage"]//img[contains(@class,"cornerImgFormat2 discount")]/@alt'
        ).extract()
        meta['promotions'] = promo[0] if promo else ''
        # Fixed: the request used to pass response.meta, silently
        # discarding the meta copy that carries the promotion text.
        yield Request(urljoin(get_base_url(response), url),
                      callback=self.parse_product, meta=meta)
    for p in soup.findAll('ul', 'stockthumbwrapper'):
        url = p.find('li', 'productThumbName').find('a')['href']
        meta = response.meta.copy()
        promo = p.find('li', 'productThumbImage').find(
            'img', attrs={'class': re.compile('cornerImgFormat2 discount')})
        meta['promotions'] = promo['alt'] if promo else ''
        yield Request(urljoin(get_base_url(response), url),
                      callback=self.parse_product, meta=meta)
    pages = soup.findAll('div', id='pagenumber')
    if pages:
        for page in set(pages[0].findAll('a')):
            # Fixed: urljoin was handed the <a> Tag itself instead of
            # its href attribute.
            yield Request(response.urljoin(page['href']),
                          meta=response.meta, callback=self.parse_brand)
    for page in set(
            hxs.select('//div[@id="pagenumber"][1]/a/@href').extract()):
        yield Request(response.urljoin(page), meta=response.meta,
                      callback=self.parse_brand)
def parse_product(self, response):
    """Yield the product if its manufacturer part number matches the
    searched part/SKU and the seller is not Towequipe; otherwise
    advance to the next queued result or the next results page.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_xpath('name', u'//span[@id="btAsinTitle"]/text()')
    loader.add_value('url', response.url)
    loader.add_xpath('price', u'//b[@class="priceLarge"]/text()')
    loader.add_xpath('image_url', u'//tr[@id="prodImageContainer"]//img/@src')
    if not loader.get_output_value(u'image_url'):
        # The image container sometimes defeats lxml; retry with soup.
        soup = BeautifulSoup(response.body)
        image_url = soup.find(lambda tag: tag.name == u'img' and
                              tag.findParent(u'tr', id=u'prodImageContainer'))
        if image_url:
            loader.add_value('image_url', image_url.get(u'src'))
    loader.add_xpath('brand', u'//span[@class="tsLabel" and contains(text(),"Brand")]/following-sibling::span/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="priceLarge"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="price"]/text()')
    partn = hxs.select(u'//span[@class="tsLabel" and contains(text(),"Manufacturer Part Number")]/following-sibling::span/text()').extract()
    if not partn:
        partn = hxs.select(u'//tr/td[contains(text(),"Manufacturer Part Number")]/following-sibling::td/text()').extract()
    # Fixed: pages with no part number at all used to raise IndexError
    # on partn[0]; an empty string now simply fails the match below.
    partn = partn[0].strip() if partn else u''
    log.msg('PARTN: [%s == %s]' % (partn.lower(),
                                   response.meta['partn'].lower()))
    log.msg('SKU: [%s == %s]' % (partn.lower(),
                                 response.meta['sku'].lower()))
    sold_by = hxs.select(
        u'//div[contains(text(),"Sold by")]/b/text()').extract()
    sold_by = sold_by[0].strip() if sold_by else u''
    log.msg(u'Sold by: %s' % sold_by)
    if (partn.lower() == response.meta['partn'].lower() or
            partn.lower() == response.meta['sku'].lower()) and \
            sold_by != u'Towequipe':
        loader.add_value('sku', response.meta['partn'])
        loader.add_value('identifier', response.meta['partn'].lower())
        yield loader.load_item()
    else:
        meta = response.meta
        next_results = meta['next_results']
        if next_results:
            next_result = next_results[0]
            meta['next_results'] = next_results[1:]
            yield Request(next_result, callback=self.parse_product,
                          meta=response.meta)
        elif meta.get('next_page'):
            yield Request(meta['next_page'], meta=response.meta)
def parse(self, response):
    """Follow "Next" pagination and hand the page to parse_product."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)

    def fix_host(url):
        # Some links point at the crawler host; rewrite to the live site.
        if '127.0.0.1' in url:
            return url.replace('127.0.0.1', 'argonautliquor.com')
        return url

    hits = hxs.select(
        u'//div[@class="pager"]/a[contains(text(),"Next")]/@href').extract()
    if hits:
        yield Request(fix_host(urljoin_rfc(get_base_url(response), hits[0])),
                      dont_filter=True)
    else:
        # lxml missed the pager; retry with a lenient soup search.
        tag = soup.find(lambda t: t.name == 'a' and 'Next' in t.text and
                        t.findParent('div', 'pager'))
        if tag:
            yield Request(
                fix_host(urljoin_rfc(get_base_url(response), tag['href'])),
                dont_filter=True)
    for product in self.parse_product(response):
        yield product
def parse(self, response):
    """Yield the cheapest search result whose price passes valid_price,
    tagged with the searched SKU."""
    hxs = HtmlXPathSelector(response)
    cheapest = None
    for product in hxs.select(
            '//div[@id="atfResults"]//div[starts-with(@id, "result_")]'):
        loader = ProductLoader(item=Product(), selector=product)
        soup = BeautifulSoup(product.extract())
        heading = soup.find('h3', attrs={'class': 'newaps'})
        loader.add_value('name', heading.findAll('span')[0].string)
        loader.add_value('url', heading.findAll('a')[0]['href'])
        loader.add_value('price',
                         soup.find('ul', attrs={'class': 'rsltL'})
                             .findAll('span')[0].string)
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'])
        price = loader.get_output_value('price')
        if price and (cheapest is None or
                      cheapest.get_output_value('price') > price):
            if valid_price(response.meta['price'], price):
                cheapest = loader
    if cheapest:
        yield cheapest.load_item()
def parse(self, response):
    """Follow "Next" pagination, then extract products from this page."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    hits = hxs.select('//a[contains(text(), "Next")]/@href').extract()
    if hits:
        yield Request(urljoin_rfc(get_base_url(response), hits[0]))
    else:
        # lxml missed the link; try a lenient soup-based text match.
        soup = BeautifulSoup(response.body)
        anchor = soup.find('a', text=re.compile('.*Next.*'))
        if anchor:
            yield Request(
                urljoin_rfc(get_base_url(response), anchor.parent['href']))
    for product in self.parse_product(response):
        yield product
def parse(self, response):
    """Parse a search results page: collect result URLs sorted by
    price (cheapest first) into meta['next_results'] and open the
    cheapest one with parse_product.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    products = hxs.select(u'//div[@id="atfResults"]//div[starts-with(@id,"result_")]')
    if products:
        meta = response.meta
        meta['next_results'] = []
        next_page = hxs.select(u'//a[@class="pagnNext"]/@href').extract()
        if next_page:
            next_page = urljoin_rfc(get_base_url(response), next_page[0])
            meta['next_page'] = next_page
        for product in products:
            # Result link: old-style 'title' anchor first, then the
            # newer 'newaps' heading layout.
            url = product.select(u'.//a[@class="title"]/@href')
            if not url:
                url = product.select(u'.//h3[@class="newaps"]/a/@href')
            if url:
                url = url[0].extract()
            else:
                continue
            url = urljoin_rfc(get_base_url(response), url)
            soup = BeautifulSoup(product.extract())
            # Price fallback chain: first span in the 'rsltL' list,
            # then 'price addon', then plain 'price'.
            price = soup.find('ul', attrs={'class': 'rsltL'})
            if price:
                price = price.findAll('span')[0]
            if not price:
                price = soup.find('span', 'price addon')
            if not price:
                price = soup.find('span', 'price')
            if price:
                # Drop the leading currency symbol.
                price = price.string.strip()[1:]
            if not price:
                # Sentinel so unpriced results sort last.
                price = '1000.00'
            meta['next_results'].append({'price': float(price), 'url': url})
        # Cheapest first; only the URLs are carried forward.
        meta['next_results'].sort(key=lambda elem: elem.get('price'))
        meta['next_results'] = [elem['url'] for elem in meta['next_results']]
        first_url = meta['next_results'][0]
        meta['next_results'] = meta['next_results'][1:]
        yield Request(first_url, callback=self.parse_product, meta=meta,
                      dont_filter=True)
def parse_product(self, response):
    """Extract a Product from a product page.

    Listing pages are delegated to parse_category; discontinued
    products are skipped. XPath is tried first for each field, with
    BeautifulSoup fallbacks for broken markup.
    """
    soup = BeautifulSoup(response.body)
    # A page carrying listing links is a category page in disguise.
    if soup.findAll('a', {'class': 'products-list__item'}):
        for r in self.parse_category(response):
            yield r
        return
    discontinued = response.xpath(
        "//div[contains(@class, 'discontinued')]")
    if not discontinued:
        discontinued = 'Discontinued Product' in response.body
    if discontinued:
        return
    name = response.xpath("//h1[@itemprop='name']/text()").extract()
    if not name:
        name = soup.find('h1', {'itemprop': 'name'}).text
    price = re.findall(
        '"per_box_price_formated":"<span class=\\\\"price\\\\">\\\\u[\da-f]{4}([\d\.]*)<\\\\/span>",',
        response.body_as_unicode())[0]
    stock = None
    brand = response.xpath('//span[@itemprop="manufacturer"]/text()').re(
        'by (.*)')
    if not brand:
        brand = soup.find('span', {
            'itemprop': 'manufacturer'
        }).text.split('by ')[-1].strip()
    sku = re.search('"sku":"([^"]*)","product_id"',
                    response.body_as_unicode()).group(1)
    identifier = re.search('"product_id":"([^"]*)"',
                           response.body_as_unicode()).group(1)
    image_url = response.xpath("//img[@class='prod-image']/@src").extract()
    if not image_url:
        image_url = soup.find('img', {'itemprop': 'image'})['src']
    # Breadcrumb trail, excluding the home and current-page entries.
    cats = [''.join(el.xpath('.//text()').extract()).strip()
            for el in response.xpath(
                "//ul[@class='gl3-breadcrumbs']/li")[1:-1]]
    shipping_cost = '2.98' if float(price) < 49 else '0'
    loader = ProductLoaderWithNameStrip(Product(), response=response)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('stock', stock)
    loader.add_value('url', response.url)
    loader.add_value('brand', brand)
    loader.add_value('sku', sku)
    loader.add_value('identifier', identifier)
    loader.add_value('image_url', image_url)
    loader.add_value('category', cats)
    loader.add_value('shipping_cost', shipping_cost)
    yield loader.load_item()
def parse(self, response):
    """Scan the search results and yield only the lowest-priced one
    that passes valid_price."""
    hxs = HtmlXPathSelector(response)
    results = hxs.select(
        '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
    winner = None
    for result in results:
        loader = ProductLoader(item=Product(), selector=result)
        soup = BeautifulSoup(result.extract())
        header = soup.find("h3", attrs={"class": "newaps"})
        loader.add_value("name", header.findAll("span")[0].string)
        loader.add_value("url", header.findAll("a")[0]["href"])
        loader.add_value("price",
                         soup.find("ul", attrs={"class": "rsltL"})
                             .findAll("span")[0].string)
        current = loader.get_output_value("price")
        if not current:
            continue
        beats_winner = (winner is None or
                        winner.get_output_value("price") > current)
        if beats_winner and valid_price(response.meta["price"], current):
            winner = loader
    if winner:
        yield winner.load_item()
def parse_items(self, response):
    """Follow product links on a listing page.

    When the server returns a page whose number differs from
    ``meta['cur']``, retries the same URL up to 5 times before giving
    up and parsing whatever was returned.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    cur_page = hxs.select('//span[@class="currentPage"]/text()').extract()
    if cur_page and (int(cur_page[0]) != response.meta["cur"]) and (
            response.meta["attempt"] < 5):
        log.msg("WRONG PAGE! ONE MORE ATTEMPT to " + response.url)
        yield Request(
            response.url + "&at=" + str(response.meta["attempt"]),
            meta={"cur": response.meta["cur"],
                  "attempt": response.meta["attempt"] + 1},
            dont_filter=True,
            callback=self.parse_items,
        )
        return
    soup = BeautifulSoup(response.body)
    # Product links are bold anchors inside two-column table cells.
    products = [
        a["href"]
        for a in soup.findAll(
            lambda tag: tag.name == "a" and tag.findChild("b") and
            tag.findParent("td", {"colspan": 2})
        )
    ]
    for url in products:
        url = urljoin_rfc(get_base_url(response), url)
        yield Request(url, callback=self.parse_product)
    # Fixed: removed a dangling, unterminated triple-quoted fragment of
    # commented-out XPath code ('"""trs = hxs.select(...') that left an
    # unclosed string literal at the end of the function.
def parse_product(self, response):
    """Build a Lego Product item from a product page.

    BeautifulSoup is used throughout because XPath does not work on
    this site's markup.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    soup = BeautifulSoup(response.body)
    try:
        name = soup.find(attrs={'itemprop': 'name'}).text
    except AttributeError:
        # Fixed: narrowed a bare ``except:`` — only a missing name node
        # (soup.find returning None) should abort the parse.
        return
    loader.add_value(
        'identifier',
        soup.find('div', {'class': 'clearfix'}).find('a')['title'])
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    # Price uses European formatting: drop thousands dots, comma -> point.
    loader.add_value(
        'price',
        extract_price(
            soup.find(attrs={'itemprop': 'price'}).text
                .replace('.', '').replace(',', '.')))
    try:
        loader.add_value('sku', re.search('(\d{4}\d*)', name).groups()[0])
    except AttributeError:
        # Fixed: narrowed a bare ``except:`` — re.search returns None
        # when the name carries no 4+ digit set number.
        self.log('Product without SKU: %s' % (response.url))
    loader.add_value('category', 'Lego')
    img = soup.find(attrs={'itemprop': 'image'}).find('img')
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img['src']))
    loader.add_value('brand', 'lego')
    loader.add_value('shipping_cost', '1.99')
    yield loader.load_item()
def parse(self, response):
    """Queue a parse_product request for every product link.

    BeautifulSoup is used because the HTML is broken and cannot be
    parsed with lxml.
    """
    soup = BeautifulSoup(response.body)
    for anchor in soup.findAll('a', {'class': 'products-list__item'}):
        yield Request(anchor['href'], callback=self.parse_product,
                      meta=response.meta)
def parse_product(self, response):
    """Parse wine listing rows into Product items.

    Rows without a size dropdown yield one item per price cell
    (brand + title + vintage, optionally suffixed with bottle size);
    rows with a ``mv_order_item`` dropdown yield one item per option,
    whose text carries both the size label and the price.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body, convertEntities=BeautifulSoup.HTML_ENTITIES)
    # products = hxs.select(u'//div[@class="itemResultsRow"]')
    try:
        products = soup.findAll('div', attrs={'class': 'itemResultsRow'})
    except AttributeError:
        products = []
    for product in products:
        # url = product.select(u'.//div[@class="itemTitle"]/a/@href').extract()[0]
        url = product.find('div', attrs={'class': 'itemTitle'}).find('a')['href']
        url = urljoin_rfc(get_base_url(response), url)
        # dropdown = product.select(u'.//select[@name="mv_order_item"]')
        dropdown = product.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            # No size dropdown: one item per price cell in the row.
            try:
                # brand = product.select(u'.//div[@class="itemTitle"]/a/span[@class="brand"]/text()').extract()[0].strip()
                brand = product.find('div', attrs={'class': 'itemTitle'}).find('a').find('span', attrs={'class': 'brand'}).text.strip()
            except AttributeError:
                brand = u''
            # title = product.select(u'.//div[@class="itemTitle"]/a/span[@class="title"]/text()').extract()[0].strip()
            title = product.find('div', attrs={'class': 'itemTitle'}).find('a').find('span', attrs={'class': 'title'}).text.strip()
            try:
                # vintage_age = product.select(u'.//div[@class="itemTitle"]/a/span[@class="vintageAge"]/text()').extract()[0].strip()
                vintage_age = product.find('div', attrs={'class': 'itemTitle'}).find('a').find('span', attrs={'class': 'vintageAge'}).text.strip()
            except AttributeError:
                vintage_age = u''
            # multiple_prices = product.select(u'.//td[@class="priceCell"]')
            multiple_prices = product.findAll('td', attrs={'class': 'priceCell'})
            for option in multiple_prices:
                name = u'%s %s %s' % (brand, title, vintage_age)
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('url', url)
                # Sale price is preferred; missing sale markup raises
                # AttributeError and falls back to the retail price.
                # price = option.select(u'.//p[@class="priceCellP salePriceP"]/span[@class="priceRetail"]/text()')
                try:
                    price = option.find('p', attrs={'class': 'priceCellP salePriceP'}).find('span', attrs={'class': 'priceSale'}).text.strip()
                except AttributeError:
                    price = option.find('p', attrs={'class': 'priceCellP'}).find('span', attrs={'class': 'priceRetail'}).text.strip()
                # if not price:
                #     price = option.select(u'.//p[@class="priceCellP"]/span[@class="priceSale"]/text()')
                #     price = price[0].extract().strip()
                # bottle_size = option.select(u'.//p[@class="priceCellP priceUnit"]/text()').extract()
                bottle_size = option.find('p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    # bottle_size = option.select(u'.//p[@class="priceCellP"]/span[@class="priceUnit"]/text()').extract()
                    bottle_size = option.find(lambda tag: tag.name == 'span' and tag.get('class', '') == 'priceUnit' and tag.findParent('p', attrs={'class': 'priceCellP'}))
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()
                loader.add_value('name', name)
                loader.add_value('price', price)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            # Dropdown present: each option text is "<price> (<size>)".
            # dropdown = dropdown[0]
            # brand = product.select(u'.//div[@class="itemTitle"]/a/span[@class="brand"]/text()').extract()[0].strip()
            # title = product.select(u'.//div[@class="itemTitle"]/a/span[@class="title"]/text()').extract()[0].strip()
            brand = product.find('div', attrs={'class': 'itemTitle'}).find('a').find('span', attrs={'class': 'brand'}).text.strip()
            title = product.find('div', attrs={'class': 'itemTitle'}).find('a').find('span', attrs={'class': 'title'}).text.strip()
            # for option in dropdown.select(u'./option/text()').extract():
            for option in [option.text for option in dropdown.findAll('option')]:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', url)
                name = u'%s %s' % (brand, title)
                option = re.search(u'(.*?) \((.*)\)', option).groups()
                name += u' %s' % option[1]
                loader.add_value('name', name)
                loader.add_value('price', option[0])
                if loader.get_output_value('price'):
                    yield loader.load_item()
def parse_product(self, response):
    """Parse an ``itemResultsRow`` product listing page.

    NOTE(review): this is a token-for-token duplicate of another
    ``parse_product`` in this file (likely copied between spiders) —
    consider extracting a shared base class.  Yields one Product per
    price cell, or per ``mv_order_item`` dropdown option.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    # Decode HTML entities so extracted text is plain unicode.
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    try:
        products = soup.findAll('div', attrs={'class': 'itemResultsRow'})
    except AttributeError:
        products = []
    for product in products:
        url = product.find('div', attrs={
            'class': 'itemTitle'
        }).find('a')['href']
        url = urljoin_rfc(get_base_url(response), url)
        # Items sold in multiple units expose an order dropdown instead of
        # individual price cells.
        dropdown = product.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            try:
                brand = product.find('div', attrs={
                    'class': 'itemTitle'
                }).find('a').find('span', attrs={
                    'class': 'brand'
                }).text.strip()
            except AttributeError:
                brand = u''
            title = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a').find('span', attrs={
                'class': 'title'
            }).text.strip()
            try:
                vintage_age = product.find('div', attrs={
                    'class': 'itemTitle'
                }).find('a').find(
                    'span', attrs={
                        'class': 'vintageAge'
                    }).text.strip()
            except AttributeError:
                vintage_age = u''
            multiple_prices = product.findAll('td',
                                              attrs={'class': 'priceCell'})
            for option in multiple_prices:
                name = u'%s %s %s' % (brand, title, vintage_age)
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('url', url)
                # Sale price first; AttributeError (missing sale markup)
                # falls back to the retail price.
                try:
                    price = option.find('p', attrs={
                        'class': 'priceCellP salePriceP'
                    }).find('span', attrs={
                        'class': 'priceSale'
                    }).text.strip()
                except AttributeError:
                    price = option.find('p', attrs={
                        'class': 'priceCellP'
                    }).find('span', attrs={
                        'class': 'priceRetail'
                    }).text.strip()
                # Bottle size: dedicated <p>, else a nested priceUnit <span>.
                bottle_size = option.find(
                    'p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    bottle_size = option.find(
                        lambda tag: tag.name == 'span' and tag.get(
                            'class', '') == 'priceUnit' and tag.findParent(
                                'p', attrs={'class': 'priceCellP'}))
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()
                loader.add_value('name', name)
                loader.add_value('price', price)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            brand = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a').find('span', attrs={
                'class': 'brand'
            }).text.strip()
            title = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a').find('span', attrs={
                'class': 'title'
            }).text.strip()
            for option in [
                    option.text for option in dropdown.findAll('option')
            ]:
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', url)
                name = u'%s %s' % (brand, title)
                # Option text format: "<price> (<variant>)".
                option = re.search(u'(.*?) \((.*)\)', option).groups()
                name += u' %s' % option[1]
                loader.add_value('name', name)
                loader.add_value('price', option[0])
                if loader.get_output_value('price'):
                    yield loader.load_item()
def parse_product(self, response):
    """Parse an argonautliquor.com product page.

    On 404/302 the product URL is stale, so we re-issue the request as a
    site search on the URL slug.  Otherwise yields one Product per price
    cell or per ``mv_order_item`` dropdown option, with sku/identifier
    and Open Graph image attached.
    """
    hxs = HtmlXPathSelector(response)
    if response.status == 404 or response.status == 302:
        # Dead product link: fall back to searching by the URL slug.
        search_url = 'http://www.argonautliquor.com/results?term=' + response.url.split('products/')[-1]
        yield Request(search_url, callback=self.parse_product)
        return
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    url = response.url
    image_url = soup.find('meta', attrs={'property': 'og:image'})
    # 'http:' guards against an og:image tag whose content is an empty
    # scheme-only URL — presumably how the site serializes "no image".
    image_url = image_url.get('content') if image_url and image_url.get('content') != 'http:' else ''
    try:
        brand = soup.find('h1', attrs={'class': 'itemTitle'}).find('span', attrs={'class': 'brand'}).text.strip()
    except AttributeError:
        brand = u''
    title = soup.find('h1', attrs={'class': 'itemTitle'}).find('span', attrs={'class': 'title'}).text.strip()
    try:
        vintage_age = soup.find('h1', attrs={'class': 'itemTitle'}).find('span', attrs={'class': 'vintageAge'}).text.strip()
    except AttributeError:
        vintage_age = u''
    dropdown = soup.find('select', attrs={'name': 'mv_order_item'})
    if not dropdown:
        multiple_prices = soup.find('div', attrs={'class': 'priceArea'}).findAll('td', attrs={'class': 'priceCell'})
        for option in multiple_prices:
            name = u'%s %s %s' % (brand, title, vintage_age)
            loader = ProductLoader(item=Product(), selector=option)
            loader.add_value('url', url)
            # Sale price preferred; AttributeError falls back to retail.
            try:
                price = option.find('p', attrs={'class': 'priceCellP salePriceP'}).find('span', attrs={'class': 'priceSale'}).text.strip()
            except AttributeError:
                price = option.find('p', attrs={'class': 'priceCellP'}).find('span', attrs={'class': 'priceRetail'}).text.strip()
            # The SKU has appeared under several markup variants over time;
            # try each historical location in turn, newest first.
            try:
                sku = option.find('p', attrs={'class': 'priceCellP itemid'}).text.strip()
            except AttributeError:
                try:
                    sku = option.find('p', attrs={'class': 'priceCellP sku'}).text.strip()
                except AttributeError:
                    try:
                        sku = option.find('p', attrs={'class': 'sku'}).text.strip()
                    except AttributeError:
                        try:
                            sku = option.find('span', attrs={'class': 'sku'}).text.strip()
                        except AttributeError:
                            sku = ''
            # Drop the literal "SKU" label from the extracted text.
            sku = sku.replace('SKU', '').strip()
            bottle_size = option.find('p', attrs={'class': 'priceCellP priceUnit'})
            if not bottle_size:
                bottle_size = option.find(lambda tag: tag.name == 'span' and tag.get('class', '') == 'priceUnit' and tag.findParent('p', attrs={'class': 'priceCellP'}))
            if bottle_size:
                name += u' %s' % bottle_size.text.strip()
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('brand', brand)
            loader.add_value('sku', sku)
            loader.add_value('identifier', sku)
            loader.add_value('image_url', image_url)
            if loader.get_output_value('price'):
                yield loader.load_item()
    else:
        for option in dropdown.findAll('option'):
            name = u'%s %s %s' % (brand, title, vintage_age)
            # Option text format: "$<price> (<size>) SKU <sku>".
            option = re.search(r'(\$[\d\.]*) \(([^)]*)\) (.*)$', option.text).groups()
            price = option[0]
            name += u' %s' % option[1].strip()
            sku = option[2].replace('SKU', '').strip()
            loader = ProductLoader(item=Product(), selector=option)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('brand', brand)
            loader.add_value('sku', sku)
            loader.add_value('image_url', image_url)
            loader.add_value('identifier', sku)
            if loader.get_output_value('price'):
                yield loader.load_item()
def parse_product(self, response):
    """Parse an Amazon-style product page into a Product item.

    Extracts name (+ selected variation label), price, brand, SKU, ASIN
    identifier, image and category; deduplicates by ASIN via ``self.ids``
    and chases the customer-reviews page when one is linked.

    Fix: the two bare ``except:`` clauses were narrowed to
    ``except Exception:`` — a bare except also swallows
    ``KeyboardInterrupt``/``SystemExit``; the guarded lookups only fail
    with ``AttributeError``/``IndexError``, which ``Exception`` covers.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Label of the currently selected variation (size/colour), appended
    # to the product name so variants get distinct names.
    option_label = ' '.join(
        hxs.select('//div[@class="variationSelected"]'
                   '/*[@class="variationLabel"]/text()').extract())
    loader = ProductLoader(item=Product(), selector=hxs)
    soup = BeautifulSoup(response.body)
    # Old page template exposes the title in span#btAsinTitle; fall back
    # to the newer h1#title markup when that node is missing.
    try:
        name = ' '.join(
            [soup.find('span', id='btAsinTitle').text, option_label]).strip()
    except Exception:
        name = ' '.join([
            hxs.select('//h1[@id="title"]/text()').extract()[0].strip(),
            option_label
        ]).strip()
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    no_price_ = False
    # Try the buy-box price in several historical locations; if the soup
    # lookup blows up (missing #handleBuy), fall back to the price table.
    try:
        soup_form = soup.find(id='handleBuy')
        price = soup_form.find('b', 'priceLarge')
        if not price:
            price = soup_form.find('span', 'price')
        if not price:
            price = soup_form.find('span', 'pa_price')
        if not price:
            no_price_ = True
        else:
            loader.add_value('price', price.text)
    except Exception:
        price = hxs.select('//div[@id="price"]//td[text()="Price:"]'
                           '/following-sibling::td/span/text()').extract()
        if not price:
            no_price_ = True
        else:
            loader.add_value('price', price[0])
    if no_price_:
        self.log('ERROR: no price found! URL:{}'.format(response.url))
        return
    # Link to the reviews listing (excluding the "create review" link).
    reviews_url = hxs.select(
        u'//a[contains(text(),"customer review") and contains(@href, "product-reviews") '
        u'and not(contains(@href, "create-review"))]/@href').extract()
    loader.add_value('brand', response.meta['brand'].strip().lower())
    sku = hxs.select(
        '//span[@class="tsLabel" and contains(text(), "Part Number")]/../span[2]/text()'
    ).extract()
    if not sku:
        sku = hxs.select(
            '//b[contains(text(), "model number")]/../text()').extract()
    if sku:
        loader.add_value('sku', sku[0].strip().lower())
    else:
        self.log('ERROR: no SKU found! URL:{}'.format(response.url))
    # The ASIN from the buy form is the canonical identifier; without it
    # the item cannot be deduplicated, so bail out.
    identifier = hxs.select('//form/input[@name="ASIN"]/@value').extract()
    if not identifier:
        self.log('ERROR: no identifier found! URL:{}'.format(response.url))
        return
    else:
        loader.add_value('identifier', identifier)
    product_image = hxs.select(
        '//*[@id="main-image" or @id="prodImage"]/@src').extract()
    if not product_image:
        self.log('ERROR: no product Image found!')
    else:
        image = urljoin_rfc(get_base_url(response), product_image[0].strip())
        loader.add_value('image_url', image)
    category = hxs.select('//*[@id="nav-subnav"]/li[1]/a/text()').extract()
    if not category:
        self.log("ERROR: category not found")
    else:
        loader.add_value('category', category[0].strip())
    product = loader.load_item()
    # Emit each ASIN only once per crawl.
    if product['identifier'] not in self.ids:
        self.ids.append(product['identifier'])
        metadata = KeterMeta()
        metadata['brand'] = response.meta['brand'].strip().lower()
        metadata['reviews'] = []
        product['metadata'] = metadata
        if reviews_url:
            # Collect reviews first; parse_review yields the product later.
            yield Request(urljoin_rfc(base_url, reviews_url[0]),
                          meta={'product': product},
                          callback=self.parse_review)
        else:
            yield product
def parse_product(self, response):
    """Parse either a listing page (``itemResultsRow`` rows) or a single
    product page.

    When no result rows are found the page is treated as a single
    product and the same price/sku extraction runs against the whole
    document (``soup``) instead of each row.  AttributeError is used as
    control flow for optional elements throughout.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    soup = BeautifulSoup(response.body,
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    try:
        products = soup.findAll('div', attrs={'class': 'itemResultsRow'})
    except AttributeError:
        products = []
    # No result rows => this response is a single product page.
    if not products:
        single_product = True
    else:
        single_product = False
    for product in products:
        url = product.find('div', attrs={
            'class': 'itemTitle'
        }).find('a')['href']
        url = urljoin_rfc(get_base_url(response), url)
        try:
            brand = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a').find('span', attrs={
                'class': 'brand'
            }).text.strip()
        except AttributeError:
            brand = u''
        title = product.find('div', attrs={
            'class': 'itemTitle'
        }).find('a').find('span', attrs={
            'class': 'title'
        }).text.strip()
        try:
            vintage_age = product.find('div', attrs={
                'class': 'itemTitle'
            }).find('a').find('span', attrs={
                'class': 'vintageAge'
            }).text.strip()
        except AttributeError:
            vintage_age = u''
        dropdown = product.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            multiple_prices = product.findAll('td',
                                              attrs={'class': 'priceCell'})
            for option in multiple_prices:
                name = u'%s %s %s' % (brand, title, vintage_age)
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('url', url)
                # Sale price preferred; fall back to retail on missing markup.
                try:
                    price = option.find('p', attrs={
                        'class': 'priceCellP salePriceP'
                    }).find('span', attrs={
                        'class': 'priceSale'
                    }).text.strip()
                except AttributeError:
                    price = option.find('p', attrs={
                        'class': 'priceCellP'
                    }).find('span', attrs={
                        'class': 'priceRetail'
                    }).text.strip()
                try:
                    sku = option.find('p', attrs={
                        'class': 'priceCellP itemid'
                    }).text.strip()
                except AttributeError:
                    sku = ''
                bottle_size = option.find(
                    'p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    bottle_size = option.find(
                        lambda tag: tag.name == 'span' and tag.get(
                            'class', '') == 'priceUnit' and tag.findParent(
                                'p', attrs={'class': 'priceCellP'}))
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()
                loader.add_value('name', name)
                loader.add_value('price', price)
                loader.add_value('sku', sku)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            for option in dropdown.findAll('option'):
                loader = ProductLoader(item=Product(), response=response)
                loader.add_value('url', url)
                name = u'%s %s' % (brand, title)
                # Option text format: "$<price> (<size>) <sku>".
                option = re.search(r'(\$[\d\.]*) \(([^)]*)\) (.*)$',
                                   option.text).groups()
                name += u' %s' % option[1]
                loader.add_value('name', name)
                loader.add_value('price', option[0])
                loader.add_value('sku', option[2])
                if loader.get_output_value('price'):
                    yield loader.load_item()
    if single_product:
        # Same extraction as above, run over the whole page; titles live in
        # a div.itemTitle without the wrapping <a>.
        url = response.url
        try:
            brand = soup.find('div', attrs={
                'class': 'itemTitle'
            }).find('span', attrs={
                'class': 'brand'
            }).text.strip()
        except AttributeError:
            brand = u''
        title = soup.find('div', attrs={
            'class': 'itemTitle'
        }).find('span', attrs={
            'class': 'title'
        }).text.strip()
        try:
            vintage_age = soup.find('div', attrs={
                'class': 'itemTitle'
            }).find('span', attrs={
                'class': 'vintageAge'
            }).text.strip()
        except AttributeError:
            vintage_age = u''
        dropdown = soup.find('select', attrs={'name': 'mv_order_item'})
        if not dropdown:
            multiple_prices = soup.find('div', attrs={
                'class': 'priceArea'
            }).findAll('td', attrs={'class': 'priceCell'})
            for option in multiple_prices:
                name = u'%s %s %s' % (brand, title, vintage_age)
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('url', url)
                try:
                    price = option.find('p', attrs={
                        'class': 'priceCellP salePriceP'
                    }).find('span', attrs={
                        'class': 'priceSale'
                    }).text.strip()
                except AttributeError:
                    price = option.find('p', attrs={
                        'class': 'priceCellP'
                    }).find('span', attrs={
                        'class': 'priceRetail'
                    }).text.strip()
                try:
                    sku = option.find('p', attrs={
                        'class': 'priceCellP itemid'
                    }).text.strip()
                except AttributeError:
                    sku = ''
                bottle_size = option.find(
                    'p', attrs={'class': 'priceCellP priceUnit'})
                if not bottle_size:
                    bottle_size = option.find(
                        lambda tag: tag.name == 'span' and tag.get(
                            'class', '') == 'priceUnit' and tag.findParent(
                                'p', attrs={'class': 'priceCellP'}))
                if bottle_size:
                    name += u' %s' % bottle_size.text.strip()
                loader.add_value('name', name)
                loader.add_value('price', price)
                loader.add_value('sku', sku)
                if loader.get_output_value('price'):
                    yield loader.load_item()
        else:
            for option in dropdown.findAll('option'):
                name = u'%s %s %s' % (brand, title, vintage_age)
                option = re.search(r'(\$[\d\.]*) \(([^)]*)\) (.*)$',
                                   option.text).groups()
                price = option[0]
                name += u' %s' % option[1].strip()
                sku = option[2]
                loader = ProductLoader(item=Product(), selector=option)
                loader.add_value('url', url)
                loader.add_value('name', name)
                loader.add_value('price', price)
                loader.add_value('sku', sku)
                if loader.get_output_value('price'):
                    yield loader.load_item()
def parse_product(self, response):
    """Parse a product page carrying Nosto tracking markup.

    Reads brand/id/availability from the hidden ``div.nosto_product``
    block.  Pages without a ``#sku-table`` yield one item; pages with
    one yield an item per table row, building a composite identifier
    ``main_id:sec_id[:volts][:pack_of][:option_id]`` that is made unique
    per page via ``option_ids``.  Any ``IndexError`` during parsing is
    treated as a transient bad page and retried up to 10 times.
    """
    try:
        # fall back to Beautiful Soup
        soup = BeautifulSoup(response.body)
        hxs = HtmlXPathSelector(response)
        container = soup.find('div', attrs={'class': 'nosto_product'})
        brand = container.find('span', attrs={'class': 'brand'}).text
        # Breadcrumb links, skipping the leading "home" link.
        cat_names = [el.text for el in soup.find("div", id='bct').findAll('a')][1:]
        main_id = container.find('span', attrs={'class': 'product_id'}).text
        availability = container.find('span', attrs={'class': 'availability'}).text
        image_url = soup.find('img', id='main-image').attrMap['src']
        options = soup.find('table', id='sku-table')
        if not options:
            # Single-variant product.
            name = soup.find('div', id='product-page-info').find('h1').text
            price = container.find('span', attrs={'class': 'price'}).text
            loader = ProductLoaderWithNameStrip(Product(), selector=hxs)
            loader.add_value('brand', brand)
            for cat_name in cat_names:
                loader.add_value('category', cat_name)
            loader.add_value('name', name)
            loader.add_value('identifier', main_id)
            loader.add_value('price', price)
            loader.add_value('url', response.url)
            loader.add_value('sku', main_id)
            loader.add_value('image_url', image_url)
            if availability.lower() == 'outofstock':
                loader.add_value('stock', 0)
            yield loader.load_item()
        else:
            # One item per SKU-table row; track composite ids already
            # emitted so colliding variants get the form option id added.
            option_ids = []
            for opt in options.findAll('tr'):
                sec_id = opt.findAll('td')[1].find('small').text
                name = opt.findAll('td')[1].text.replace(sec_id, '')
                sec_id = sec_id.strip('(').strip(')')
                identifier = main_id + ':' + sec_id
                volts = get_volts_from_name(name)
                if volts is not None:
                    identifier = identifier + ':' + volts
                pack_of = get_pack_of_from_name(name)
                if pack_of is not None:
                    identifier = identifier + ':' + pack_of
                if identifier in option_ids:
                    option_id = opt.find('input', attrs={'name': 'ID'}).get('value')
                    identifier = identifier + ':' + option_id
                option_ids.append(identifier)
                # Strip the pound sign in both its unicode and byte forms.
                price = opt.find('td', attrs={'class': 'price'}).text.strip(u'\xa3').strip('£')
                loader = ProductLoaderWithNameStrip(Product(), response=response)
                loader.add_value('brand', brand)
                for cat_name in cat_names:
                    loader.add_value('category', cat_name)
                loader.add_value('name', name)
                loader.add_value('identifier', identifier)
                loader.add_value('price', price)
                loader.add_value('url', response.url)
                loader.add_value('sku', main_id)
                loader.add_value('image_url', image_url)
                if availability.lower() == 'outofstock':
                    loader.add_value('stock', 0)
                yield loader.load_item()
    except IndexError as e:
        # try loading page again
        tries = response.meta.get('try', 0)
        if tries < 10:
            yield Request(response.url,
                          callback=self.parse_product,
                          dont_filter=True,
                          meta={'try': tries + 1})
        else:
            # Retries exhausted: record the failure and propagate.
            self.errors.append("Error scraping page %s: %s" % (response.url, str(e)))
            raise
def parse_product(self, response):
    """Parse a product page that lists variants either in a
    ``#responsive-table`` (one row per variant) or as ``div.option``
    dropdowns (items built from the cartesian product of options).

    Yields Product items with a price-band shipping cost attached.
    Pages without a ``div.product`` node are retried via ``_retry_page``
    or logged as parse failures.

    Fix: the price-band shipping-cost ladder was duplicated verbatim in
    both branches; it is now a single nested helper, so the bands cannot
    drift apart.
    """

    def _shipping_cost_for(real_price):
        # Price-band shipping table; None means "no flat rate applies".
        if real_price < 15:
            return 3
        elif real_price < 40:
            return 4
        elif real_price < 130:
            return 7
        return None

    soup = BeautifulSoup(response.body)
    if not soup.find('div', attrs={'class': 'product'}):
        retry_request = _retry_page(response)
        if retry_request:
            yield retry_request
        else:
            self.log(
                "Error parsing page, couldn't extract product name: %s" %
                response.url)
        return
    main_name = soup.find('div', attrs={'class': 'product'}).h1.text
    main_name = remove_entities(main_name)
    # Brand sits in the <td> following the one whose text mentions "brand".
    brand_el = soup.find(
        lambda tag: tag.name == 'td' and 'brand' in tag.text.lower())
    brand = brand_el.findNextSibling('td').text.strip() if brand_el else ''
    # Breadcrumb categories, skipping the first two (home/root) entries.
    cat_names = [
        span.a.text for span in soup.find('div', attrs={
            'class': 'breadcrumbtrail'
        }).span.findAll('span') if span.a
    ][2:]
    image_url = soup.find('img', {'itemprop': 'image'})
    image_url = image_url['src'] if image_url else None
    table = soup.find('table', id='responsive-table')
    options = soup.findAll('div', attrs={'class': 'option'})
    if table:
        for row in table.findAll('tr'):
            # Skip head row
            if not row.td:
                continue
            name = row.find('span', attrs={'class': 'name'}).text
            name = remove_entities(name)
            if not _main_name_in_opt_name(main_name, name):
                name = main_name + ' ' + name
            identifier = row.find('span', attrs={'class': 'codenumber'})
            if not identifier:
                self.errors.append(
                    "Identifier not found for products on page: %s" %
                    response.url)
                continue
            identifier = identifier.text
            price = row.find(_is_price_tag).text
            shipping_cost = _shipping_cost_for(extract_price(price))
            loader = ProductLoaderWithNameStrip(Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('brand', brand)
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)
            loader.add_value('price', price)
            for cat_name in cat_names:
                loader.add_value('category', cat_name)
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('image_url', image_url)
            yield loader.load_item()
    elif options:
        # Product id is embedded in the URL as "...p-<id>.<ext>".
        main_id = response.url.split('.')[-2].split('p-')[-1]
        price = soup.find('span', attrs={'class': 'inctax'}).span.text
        shipping_cost = _shipping_cost_for(extract_price(price))
        # Collect the selectable values of each option dropdown.
        results = {}
        for opt in options:
            opt_name = opt.label.span.text
            results[opt_name] = []
            for subopt in opt.select.findAll('option'):
                subopt_name = subopt.text
                subopt_value = _soup_el_get_attr(subopt, 'value')
                # value "0" is the placeholder ("please select") entry.
                if subopt_value == '0':
                    continue
                results[opt_name].append({
                    'id': remove_entities(subopt_name).replace('"', ''),
                    'name': opt_name + ': ' + subopt_name
                })
        # One item per combination of option values.
        for opt_tuple in product(*results.values()):
            name = _build_opt_name(main_name, opt_tuple)
            identifier = _build_opt_id(main_id, opt_tuple)
            loader = ProductLoaderWithNameStrip(Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', response.url)
            loader.add_value('brand', brand)
            loader.add_value('identifier', identifier)
            loader.add_value('sku', identifier)
            loader.add_value('price', price)
            for cat_name in cat_names:
                loader.add_value('category', cat_name)
            loader.add_value('shipping_cost', shipping_cost)
            loader.add_value('image_url', image_url)
            yield loader.load_item()
def parse_product(self, response):
    """Parse a product page, preferring XPath extraction and falling back
    to BeautifulSoup when the name cannot be extracted (markup the XPath
    selector chokes on); retries the page if even the fallback finds no
    name.  Items without an identifier are dropped silently.
    """
    hxs = HtmlXPathSelector(text=response.body_as_unicode())
    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', response.url)
    identifier = hxs.select('//input[@id="catentryId"]/@value').extract()
    loader.add_value('identifier', identifier)
    loader.add_value('sku', identifier)
    loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
    price = ''.join(
        hxs.select('//div[@itemprop="price"]//span[@class="price"]//text()'
                   ).extract()).strip()
    loader.add_value('price', price)
    # Breadcrumbs minus the home entry, then skip one more leading crumb.
    categories = hxs.select(
        '//ul[@class="breadcrumbs"]//li[not(@class="home")]/a/span/text()'
    ).extract()[1:]
    loader.add_value('category', categories)
    image_url = hxs.select('//img[@id="productMainImage"]/@src').extract()
    if image_url:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), image_url[0]))
    brand = hxs.select(
        '//li[contains(text(), "BRAND")]/span/text()').extract()
    loader.add_value('brand', brand)
    item = loader.load_item()
    if not item.get('name'):
        # XPath path failed to find a name: redo extraction with soup.
        log.msg('Using BeautifulSoup: ' + response.url)
        loader = ProductLoader(response=response, item=Product())
        soup = BeautifulSoup(response.body)
        loader.add_value('url', response.url)
        identifier = soup.find('input', attrs={'id': 'catentryId'})
        identifier = _soup_el_get_attr(identifier, 'value')
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        name = soup.find('h1', attrs={'itemprop': 'name'}).text
        loader.add_value('name', name)
        categories = [
            li.a.span.text for li in soup.find('ul', attrs={
                'class': 'breadcrumbs'
            }).findAll('li') if li.a
        ][2:]
        loader.add_value('category', categories)
        price = soup.find('div', attrs={
            'itemprop': 'price'
        }).find('span', attrs={
            'class': 'price'
        }).text
        loader.add_value('price', price)
        image_url = soup.find('img', attrs={'id': 'productMainImage'})
        if image_url:
            image_url = _soup_el_get_attr(image_url, 'src')
            loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), image_url))
        # Brand: first <li> mentioning BRAND (case-insensitive).
        brand = ''
        for li in soup.findAll('li'):
            if 'BRAND' in li.text.upper():
                brand = li.span.text
                break
        loader.add_value('brand', brand)
        item = loader.load_item()
        if item['identifier']:
            yield item
    else:
        if item['identifier']:
            yield item
    # NOTE(review): reconstructed from a whitespace-mangled source — this
    # final retry guard appears to sit at function level so it fires when
    # the soup fallback also produced no name; confirm against history.
    if not item.get('name'):
        request = self.retry(response,
                             "No name for product: " + response.url)
        if request:
            yield request
        return