def get_asins_generator(self):
    with open(self.file_path) as f:
        reader = csv.DictReader(f)
        for row in reader:
            if row['AMAZON'].strip():
                yield (AmazonUrlCreator.get_product_asin_from_url(row['AMAZON']),
                       row['PRODUCT_NUMBER'])
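# A minimal sketch of the CSV layout get_asins_generator() expects, and how
# the generator is consumed. The column names come from the code above; the
# sample values are hypothetical:
#
#   AMAZON,PRODUCT_NUMBER
#   http://www.amazon.de/dp/B00EXAMPLE,SKU-123
#   ,SKU-456            <- rows with a blank AMAZON cell are skipped
#
#   for asin, product_number in self.get_asins_generator():
#       yield Request(self.get_url_from_asin(asin), callback=self.parse_product)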
def __parse_product(self, response):
    """Parse the product page just to get the seller name."""
    if self.scraper.antibot_protection_raised(response.body_as_unicode()):
        if self.do_retry:
            yield self.retry_download(url=response.url,
                                      metadata=response.meta,
                                      callback=self.parse_product)
            # Stop here so the blocked page is not parsed as a product page
            return
        else:
            self.log('[AMAZON] WARNING: Amazon antibot protection detected, '
                     'consider using proxy/tor, url: [{}]'.format(response.url))

    product_info = self.scraper.scrape_product_details_page(response)
    if not product_info:
        self.log("[AMAZON] WARNING: no product info: %s" % response.url)
        return

    # Fix self.current_search_item and meta['search_item']. Needed for BSM spider
    hxs = HtmlXPathSelector(response)
    out_of_stock = hxs.select('//div[@class="availRed"]').extract()
    not_available = hxs.select('//span[@class="availOrange"]').extract()
    if out_of_stock or not_available:
        product_info['stock'] = '0'

    # SKU: try the "Modellnummer" table cell first, then the list variants,
    # then "Herstellerreferenz" (manufacturer reference)
    sku = hxs.select('//tr[td[text()="Modellnummer"]]/td[@class="value"]/text()').extract()
    if sku:
        product_info['sku'] = sku[0]
    else:
        product_info['sku'] = ''.join(hxs.select(
            '//li[b/text()="Modellnummer:"]/text()').extract()).strip()
        if not product_info['sku']:
            product_info['sku'] = ''.join(hxs.select(
                '//li[contains(b/text(), "Herstellerreferenz")]/text()').extract()).strip()
        if not product_info['sku']:
            product_info['sku'] = ''.join(hxs.select(
                '//tr[contains(td/text(), "Herstellerreferenz")]/td[@class="value"]/text()'
            ).extract()).strip()

    if not product_info['price']:
        product_info['price'] = ''.join(hxs.select(
            '//td/b[@class="priceLarge"]/text()').extract())
    if not product_info['price']:
        product_info['price'] = ''.join(hxs.select(
            '//span[@id="priceblock_ourprice"]/text()').extract())

    product_info['dealer'] = ''.join(hxs.select(
        '//div[@class="buying"]/b/a[contains(@href, "seller")]/text()').extract())
    if not product_info['dealer']:
        product_info['dealer'] = ''.join(hxs.select(
            '//div[@id="merchant-info"]/a[contains(@href, "seller")]/text()').extract())

    if not response.meta.get('search_item'):
        response.meta['search_item'] = product_info
    if not self.current_search_item:
        self.current_search_item = product_info

    if response.meta.get('seller_identifier', None) and \
            not product_info.get('seller_identifier', None):
        product_info['seller_identifier'] = response.meta['seller_identifier']

    check_match = response.meta.get('check_match', True)
    if self.type == 'search':
        match = self.match(response.meta, self.current_search_item, product_info)
    elif self.type in ('category', 'asins'):
        match = True
    else:
        raise CloseSpider("Wrong spider type: %s" % self.type)

    if check_match and not match:
        self.log("[AMAZON] WARNING: product does not match: %s" % response.url)
        return

    if self.parse_options:
        if product_info['options'] and response.meta.get('parse_options', True):
            self.log('[AMAZON] OPTIONS FOUND => %s' % response.url)
            for option in product_info['options']:
                new_meta = response.meta.copy()
                new_meta.update({
                    'parse_options': False,
                    'search_string': self.current_search,
                    'search_item': self.current_search_item,
                    'check_match': check_match,
                })
                yield Request(option['url'], self.parse_product, meta=new_meta)
            return
        else:
            if product_info['option_texts']:
                product_info['name'] += ' [' + ', '.join(product_info['option_texts']) + ']'

    if self.type == 'asins':
        url_asin = AmazonUrlCreator.get_product_asin_from_url(product_info['url'])
        if product_info['asin'].lower() != url_asin.lower():
            self.log("[AMAZON] product ASIN '%s' does not match url ASIN '%s'. Page: %s"
                     % (product_info['asin'], url_asin, response.url))
            return

    # Amazon Direct
    if self.amazon_direct:
        if self.collect_reviews and product_info.get('reviews_url'):
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['reviews_url'],
                          callback=self.parse_reviews,
                          meta=new_meta)
        else:
            product = self.construct_product(product_info, meta=response.meta)
            self.log("[AMAZON] collect parse product: %s" % product['identifier'])
            if self.type == 'category':
                yield product
            else:
                self._collect_amazon_direct(product, response.meta)
    # Buy Box
    elif self.only_buybox:
        if (product_info['vendor'] and self._seller_ok(product_info['vendor'])) or \
                self.collect_products_with_no_dealer:
            if self.collect_reviews and product_info.get('reviews_url'):
                new_meta = response.meta.copy()
                new_meta['found_item'] = product_info
                if self.type == 'search':
                    new_meta.update({
                        'search_string': response.meta['search_string'],
                        'search_item': self.current_search_item,
                    })
                yield Request(product_info['reviews_url'],
                              callback=self.parse_reviews,
                              meta=new_meta)
            else:
                product = self.construct_product(product_info, meta=response.meta)
                self.log("[AMAZON] collect parse product: %s" % product['identifier'])
                if self.type == 'category':
                    yield product
                else:
                    self._collect_buybox(product, response.meta)
        elif not product_info['vendor']:
            # TODO: collect vendor from vendor details page
            self.log("[AMAZON] WARNING: product with no vendor: %s" % response.url)
        else:
            self.log("[AMAZON] WARNING: vendor not allowed: %s" % response.url)
    # All sellers / lowest price
    elif self.all_sellers or self.lowest_product_and_seller:
        # Go to MBC lists to get dealers prices
        collect_mbc = response.meta.get('collect_mbc', True)
        if collect_mbc and product_info.get('mbc_list_url_new') and self.collect_new_products:
            # yield mbc parse
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['mbc_list_url_new'],
                          callback=self.parse_mbc_list,
                          meta=new_meta)
        elif collect_mbc and product_info.get('mbc_list_url_used') and self.collect_used_products:
            # yield mbc parse
            new_meta = response.meta.copy()
            new_meta['found_item'] = product_info
            if self.type == 'search':
                new_meta.update({
                    'search_string': response.meta['search_string'],
                    'search_item': self.current_search_item,
                })
            yield Request(product_info['mbc_list_url_used'],
                          callback=self.parse_mbc_list,
                          meta=new_meta)
        else:
            if (product_info['vendor'] and self._seller_ok(product_info['vendor'])) or \
                    self.collect_products_with_no_dealer:
                if self.collect_reviews and product_info.get('reviews_url'):
                    new_meta = response.meta.copy()
                    new_meta['found_item'] = product_info
                    if self.type == 'search':
                        new_meta.update({
                            'search_string': response.meta['search_string'],
                            'search_item': self.current_search_item,
                        })
                    yield Request(product_info['reviews_url'],
                                  callback=self.parse_reviews,
                                  meta=new_meta)
                else:
                    product = self.construct_product(product_info, meta=response.meta)
                    self.log("[AMAZON] collect parse product: %s" % product['identifier'])
                    if self.type == 'category':
                        yield product
                    else:
                        self._collect(product)
            elif not product_info['vendor']:
                # TODO: collect vendor from vendor details page
                self.log("[AMAZON] WARNING: product with no vendor: %s" % response.url)
            else:
                self.log("[AMAZON] WARNING: vendor not allowed: %s" % response.url)
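# `_seller_ok` is called above but not defined in this section. A minimal
# sketch of the assumed behaviour, filtering vendors against configured
# allow/deny lists (`sellers_whitelist` and `sellers_blacklist` are
# hypothetical attribute names):
#
#   def _seller_ok(self, vendor):
#       if self.sellers_whitelist:
#           return vendor.lower() in self.sellers_whitelist
#       return vendor.lower() not in self.sellers_blacklist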
def __parse_product_list(self, response):
    """Callback for Scrapy. Processes a search results page.

    TODO: incorporate cache
    """
    if self.scraper.antibot_protection_raised(response.body_as_unicode()):
        if self.do_retry:
            yield self.retry_download(url=response.url,
                                      metadata=response.meta,
                                      callback=self.parse_product_list)
            # Stop here so the blocked page is not parsed as a results page
            return
        else:
            self.log('[AMAZON] WARNING: Amazon antibot protection detected, '
                     'consider using proxy/tor, url: %s' % response.url)

    follow_suggestions = response.meta.get("follow_suggestions", True)
    is_main_search = response.meta.get("is_main_search", True)

    data = self.scraper.scrape_search_results_page(response)
    if not self.check_number_of_results_fits(data):
        requests = self.get_subrequests_for_search_results(response, data)
        self.log("[AMAZON] WARNING: Number of results is too big (%d). "
                 "Splitting to %d requests. URL: %s"
                 % (data['results_count'], len(requests), response.url))
        for req in requests:
            yield req
        return

    if data['products']:
        items = data['products']
        found_for = None
        if self.type == 'search':
            found_for = self.current_search
        elif self.type == 'category':
            found_for = self.current_category
        self.log('[AMAZON] Found products for [%s]' % found_for)
    elif data['suggested_products'] and self.try_suggested:
        items = data['suggested_products']
        self.log('[AMAZON] No products found for [%s]. Using suggested products. URL: %s'
                 % (self.current_search, response.url))
    else:
        items = []

    if not items and not response.meta.get('ean_search', False):
        search_string = self.ean_list[self.current_search.replace('"', '')]
        url = AmazonUrlCreator.build_search_url(self.domain, search_string,
                                                self.amazon_direct)
        s_item = {
            'sku': '',
            'brand': '',
            'name': '',
            'category': '',
            'price': 0,
        }
        yield Request(url,
                      meta={'search_string': search_string,
                            'search_item': s_item,
                            'ean_search': True},
                      dont_filter=True,
                      callback=self.parse_product_list)

    if not data['products'] and follow_suggestions and self.try_suggested:
        self.log('[AMAZON] No products or suggested products found for [%s], '
                 'trying suggested searches' % self.current_search)
        for url in data['suggested_search_urls']:
            # Mark the request as a suggested search so it does not follow
            # further suggestions of its own
            new_meta = response.meta.copy()
            new_meta.update({
                'search_string': response.meta['search_string'],
                'search_item': self.current_search_item,
                'follow_suggestions': False,
                'is_main_search': False,
            })
            yield Request(url, meta=new_meta, dont_filter=True,
                          callback=self.parse_product_list)

    matched_any = False
    # Amazon Direct
    if self.amazon_direct and not self.only_buybox and not self.all_sellers \
            and not self.lowest_product_and_seller:
        for item in items:
            results = list(self._process_product_list_item_amazon_direct(response, item))
            # The last element of `results` is the match flag; accumulate it
            # so that a single matching item is enough
            matched_any = matched_any or results[-1]
            for req in results[:-1]:
                yield req
    # Buy-Box
    elif self.only_buybox and not self.amazon_direct and not self.all_sellers \
            and not self.lowest_product_and_seller:
        for item in items:
            results = list(self._process_product_list_item_buybox(response, item))
            matched_any = matched_any or results[-1]
            for req in results[:-1]:
                yield req
    # All sellers / lowest price dealer
    elif self.all_sellers or self.lowest_product_and_seller:
        for item in items:
            results = list(self._process_product_list_item_all_sellers(response, item))
            matched_any = matched_any or results[-1]
            for req in results[:-1]:
                yield req

    next_url = data['next_url']
    follow_next = False
    if self.type == 'category':
        follow_next = True
    elif self.type == 'search':
        # Follow to next pages only for the original search, or for a
        # suggested search if at least one product matched from the first
        # page; otherwise it would try to crawl the whole of Amazon
        follow_next = is_main_search or matched_any

    if next_url and follow_next:
        page = data.get('current_page', 1)
        page = int(page) if page is not None else 1
        if self.max_pages is None or page <= self.max_pages:
            new_meta = response.meta.copy()
            new_meta.update({
                'follow_suggestions': False,
                'is_main_search': is_main_search,
                'current_page': page + 1,
            })
            yield Request(next_url, meta=new_meta, dont_filter=True,
                          callback=self.parse_product_list)
        else:
            self.log('[AMAZON] Max page limit %d reached. URL: %s'
                     % (self.max_pages, response.url))
    elif next_url:
        self.log('[AMAZON] Not following next page from %s: %s'
                 % (response.url, next_url))
    else:
        self.log('[AMAZON] No next url from %s' % response.url)
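# The result-splitting branch above depends on two helpers that are not shown
# in this section. A sketch of their assumed contract (`max_results` is a
# hypothetical attribute name):
#
#   def check_number_of_results_fits(self, data):
#       # True when the result set is small enough to paginate directly
#       return data['results_count'] <= self.max_results
#
# `get_subrequests_for_search_results` would then return a list of narrower
# search requests (e.g. per category or price band) whose results do fit.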
def get_url_from_asin(self, asin):
    return AmazonUrlCreator.build_url_from_asin(self.domain, asin)
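# A sketch of the URL build_url_from_asin presumably produces, assuming the
# pattern visible in the commented-out lines of scrape_mbc_list_page below:
#
#   >>> AmazonUrlCreator.build_url_from_asin('www.amazon.de', 'B00EXAMPLE')
#   'http://www.amazon.de/gp/product/B00EXAMPLE/'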
    # This fragment begins mid-block: the enclosing ``if``/``try`` lines are
    # reconstructed here from the dangling ``except``/``else`` clauses
    if item.get('price') is not None:
        try:
            item['price'] = item['price'][0]
            price = extract_price2uk(item['price']) if not isinstance(
                item['price'], Decimal) else item['price']
        except Exception, e:
            self.log('ERROR: extracting price => PRICE: %s' % repr(item['price']))
            raise e
    else:
        price = Decimal('0')
    price = self.transform_price(price)
    loader.add_value('price', price)

    if item.get('asin') and item.get('seller_identifier'):
        loader.add_value(
            'url',
            AmazonUrlCreator.build_url_from_asin_and_dealer_id(
                self.domain, item['asin'], item['seller_identifier']))
    elif item.get('asin'):
        loader.add_value(
            'url', AmazonUrlCreator.build_url_from_asin(self.domain, item['asin']))
    elif self.use_amazon_identifier:
        loader.add_value(
            'url', AmazonUrlCreator.build_url_from_asin(self.domain, item['identifier']))
    elif item.get('url'):
        loader.add_value('url', item['url'])

    # take sku from model if configured to do so
    if item.get('model') and self.model_as_sku:
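# `extract_price2uk` is defined elsewhere in the project. A sketch of the
# assumed behaviour, normalising a UK-style price string to a Decimal:
#
#   >>> extract_price2uk(u'\xa31,234.56')
#   Decimal('1234.56')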
def scrape_mbc_list_page(self, response):
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    try:
        url = hxs.select('//a[@id="olpDetailPageLink"]/@href').extract()[0]
        url = urljoin(base_url, url)
        url_parts = url.split('/')
        try:
            asin = url_parts[url_parts.index('product') + 1]
        except ValueError:
            asin = url_parts[url_parts.index('dp') + 1]
    except IndexError:
        return None

    products = []
    for i, result in enumerate(
            hxs.select('//div[@id="olpOfferList"]//div[contains(@class, "olpOffer")]'), 1):
        product = {}
        name = ' '.join(
            hxs.select(u'//div[@id="olpProductDetails"]/h1//text()').extract()).strip()
        product['name'] = AmazonFilter.filter_name(name)

        brand = hxs.select(u'//div[@id="olpProductByline"]/text()').extract()
        if brand:
            product['brand'] = AmazonFilter.filter_brand(brand[0])

        price_el = result.select('.//span[contains(@class, "olpOfferPrice")]/text()')
        if not price_el:
            # check if there is text "Add to basket to check price"
            price_text = result.select(
                './/div[p[contains(@class, "olpShippingInfo")]]/text()'
            ).extract()[0].strip()
            if 'basket' in price_text.lower():
                product['price'] = None
            else:
                raise AmazonScraperException(
                    "Couldn't extract price from element %d from url %s"
                    % (i, response.url))
        else:
            price = price_el.extract()[0].strip()
            product['price'] = self._extract_price(response.url, price)

        seller_id = None
        seller_urls = result.select(
            u'.//*[contains(@class, "olpSellerName")]//a/@href').extract()
        if seller_urls:
            seller_url_ = seller_urls[0]
            if 'seller=' in seller_url_:
                seller_id = url_query_parameter(seller_url_, 'seller')
            else:
                seller_parts = seller_url_.split('/')
                try:
                    seller_id = seller_parts[seller_parts.index('shops') + 1]
                except (IndexError, KeyError, ValueError):
                    # External website (link "Shop this website"?)
                    seller_id = url_query_parameter(seller_url_, 'merchantID')

        product['identifier'] = asin
        product['asin'] = asin
        if seller_id:
            product['seller_identifier'] = seller_id
            product['url'] = AmazonUrlCreator.build_url_from_asin_and_dealer_id(
                AmazonUrlCreator.get_domain_from_url(response.url), asin, seller_id)
            product['seller_url'] = AmazonUrlCreator.build_vendor_url(
                AmazonUrlCreator.get_domain_from_url(response.url), seller_id)
            # product['url'] = 'http://%s/gp/product/%s/?m=%s' % (self._get_domain_from_url(response.url), product_id, seller_id)
        else:
            product['seller_identifier'] = None
            product['url'] = AmazonUrlCreator.build_url_from_asin(
                AmazonUrlCreator.get_domain_from_url(response.url), asin)
            product['seller_url'] = None
            # product['url'] = 'http://%s/gp/product/%s/' % (self._get_domain_from_url(response.url), product_id)

        shipping = result.select('.//span[@class="olpShippingPrice"]/text()').extract()
        if shipping:
            product['shipping_cost'] = shipping[0]

        image_url = hxs.select(u'//div[@id="olpProductImage"]//img/@src').extract()
        if image_url:
            product['image_url'] = urljoin(base_url, image_url[0])

        vendor = result.select(
            u'.//div[contains(@class, "olpSellerColumn")]//img/@title').extract()
        if not vendor:
            vendor = result.select(
                u'.//div[contains(@class, "olpSellerColumn")]//img/@alt').extract()
        if not vendor:
            vendor = result.select(
                u'.//*[contains(@class, "olpSellerName")]//a/b/text()').extract()
        if not vendor:
            vendor = result.select(
                u'.//*[contains(@class, "olpSellerName")]//span/a/text()').extract()
        if vendor:
            vendor = vendor[0]
            if vendor.lower().startswith('amazon'):
                vendor = 'Amazon'
            else:
                vendor = 'AM - ' + vendor
            product['vendor'] = vendor
        elif not seller_id:
            product['vendor'] = 'Amazon'
        else:
            product['vendor'] = None

        stock = result.select(
            './/div[contains(@class,"olpDeliveryColumn")]//text()').re('En Stock|En stock')
        if stock:
            product['unavailable'] = False

        products.append(product)

    next_url = hxs.select(
        '//ul[@class="a-pagination"]/li[@class="a-last"]/a/@href').extract()
    next_url = urljoin(base_url, next_url[0]) if next_url else None
    current_page = hxs.select(
        '//ul[@class="a-pagination"]/li[@class="a-selected"]/a/text()').extract()
    current_page = current_page[0] if current_page else None

    return {
        'next_url': next_url,
        'current_page': current_page,
        'products': products,
    }
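# A minimal usage sketch of the dict returned above, following the pagination
# pattern used elsewhere in the spider (parse_mbc_list is the callback
# referenced in __parse_product; the loop body is illustrative):
#
#   data = self.scraper.scrape_mbc_list_page(response)
#   if data:
#       for product in data['products']:
#           yield self.construct_product(product, meta=response.meta)
#       if data['next_url']:
#           yield Request(data['next_url'], callback=self.parse_mbc_list,
#                         meta=response.meta)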