Example No. 1
    def __init__(self, captcha_retries='10', *args, **kwargs):
        super(AmazonProductsSpider, self).__init__(*args, **kwargs)

        self.captcha_retries = int(captcha_retries)

        self.mtp_class = Amazon_marketplace(self)

        self._cbw = CaptchaBreakerWrapper()
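
Note that captcha_retries arrives as a string: Scrapy passes spider arguments given on the command line ("scrapy crawl amazon_products -a captcha_retries=5") through as strings, hence the int(...) coercion above. A minimal programmatic sketch of the same invocation, assuming the spider is importable from a hypothetical module named amazon_spiders:

from scrapy.crawler import CrawlerProcess
from amazon_spiders import AmazonProductsSpider  # hypothetical module name

process = CrawlerProcess()
# keyword arguments to crawl() are forwarded to the spider's __init__
process.crawl(AmazonProductsSpider, captcha_retries='5')
process.start()
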
Example No. 2
    def __init__(self, outfile=None, test_category=None):
        self.outfile = outfile

        # if set, crawl only this category (a level 1/2 category name); used for testing
        self.test_category = test_category

        # if test category is set and no output file was specified, set the name of outfile to a special "test" name
        if self.test_category and not self.outfile:
            self.outfile = "amazon_categories_test.jl"

        # level that is considered to contain departments
        self.DEPARTMENT_LEVEL = 2

        # hardcoded top-level category (level 1 and 2) URLs to replace/supplement some of those found on the sitemap above
        # (they point to the same categories but have different page content; they were found manually)
        # reason: they provide more info on product counts than the pages found on the sitemap
        # keys are category names as they appear in the sitemap; values are the URLs that will replace/supplement the sitemap links
        self.EXTRA_TOPLEVEL_CATEGORIES_URLS = {
            "Baby": "http://www.amazon.com/s/ref=lp_166835011_ex_n_1?rh=n%3A165796011&bbn=165796011&ie=UTF8&qid=1393338541",
            "Electronics & Computers": "http://www.amazon.com/s/ref=lp_172659_ex_n_1?rh=n%3A172282&bbn=172282&ie=UTF8&qid=1393338741",
            "Home, Garden & Tools": "http://www.amazon.com/s/ref=lp_284507_ex_n_1?rh=n%3A1055398&bbn=1055398&ie=UTF8&qid=1393338782",
            "Kindle E-readers & Books": "http://www.amazon.com/s/ref=lp_154606011_ex_n_1?rh=n%3A133140011&bbn=133140011&ie=UTF8&qid=1395704970",
            "Apps & Games": "http://www.amazon.com/b/ref=sd_allcat_fire_apps_games?ie=UTF8&node=3427287011",
            "Movies & TV": "http://www.amazon.com/action-adventure-dvd-bluray/b/ref=MoviesHPBB_Genres_Action?ie=UTF8&node=2650363011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-left-2&pf_rd_r=0GAWFEZ3EXP8PEYCM6X3&pf_rd_t=101&pf_rd_p=1753817742&pf_rd_i=2625373011",
            "All Beauty": "http://www.amazon.com/s/ref=lp_11059031_ex_n_1?rh=n%3A3760911&bbn=3760911&ie=UTF8&qid=1395793680",
            "Health, Household & Baby Care": "http://www.amazon.com/s/ref=lp_6183682011_ex_n_1?rh=n%3A3760901&bbn=3760901&ie=UTF8&qid=1395822180",
            "Tires & Wheels": "http://www.amazon.com/s/ref=lp_353609011_ex_n_1?rh=n%3A15684181%2Cn%3A%2115690151%2Cn%3A15706571&bbn=15706571&ie=UTF8&qid=1395824546",
            "Motorcycle & Powersports": "http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A15684181%2Cn%3A%2115690151%2Cn%3A346333011&bbn=346333011&ie=UTF8&qid=1395824599",
            # this entry is partial - "Automotive & Industrial" also contains the "Industrial & Scientific" categories, which can be found in the sitemap
            "Automotive & Industrial": "http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A15684181&bbn=15684181",
        }

        # flag indicating whether to compute overall product counts in pipelines phase for this spider.
        # if on, 'catid' and 'parent_catid' fields need to be implemented
        self.compute_nrproducts = True

        # counter used to auto-increment department ids
        self.department_count = 0
        # counter for category id
        self.catid = 0

        # level to stop crawling (don't extract subcategories below this level)
        self.LEVEL_BARRIER = -2

        # maximum number of retries when presented with captcha form
        self.MAX_CAPTCHA_RETRY = 10

        # dictionaries associating department names with other attributes - used for setting parent category info for level 1 categories
        # associates department names with their ids
        self.departments_ids = {}
        # associates department names with their urls (will be available only for extra_categories)
        self.department_urls = {}
        # associate department names with their category ids
        self.departments_cat_ids = {}

        # captcha breaker
        self.CB = CaptchaBreakerWrapper()
Example No. 3
    def __init__(self,
                 captcha_retries='10',
                 *args, **kwargs):

        super(AmazonBaseClass, self).__init__(*args, **kwargs)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()
Example No. 4
    def __init__(self,
                 limit='100',
                 service_url=None,
                 captcha_retries='10',
                 *args,
                 **kwargs):
        super(UrlServiceSpider, self).__init__(*args, **kwargs)

        if service_url is None:
            raise AssertionError("Service URL is not optional.")

        self.limit = limit
        self.captcha_retries = int(captcha_retries)
        self.service_url = service_url

        self._cbw = CaptchaBreakerWrapper()

        queue_url = urlparse.urljoin(
            self.service_url, 'get_queued_urls/?limit=%d&block=%d') \
            % (int(limit), 0)
        self.log("Fetching URLs with '%s'." % queue_url, level=DEBUG)
        self.start_urls.append(queue_url)
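
For illustration, the queue_url construction above yields the following with a hypothetical service address and the default limit:

import urlparse  # Python 2 standard library, as in the snippet above

service_url = 'http://localhost:8000/'  # hypothetical URL service address
queue_url = urlparse.urljoin(
    service_url, 'get_queued_urls/?limit=%d&block=%d') % (100, 0)
# queue_url == 'http://localhost:8000/get_queued_urls/?limit=100&block=0'
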
Example No. 5
    def __init__(self,
                 url_formatter=None,
                 client_url=None,
                 file_name=None,
                 product_asins=None,
                 captcha_retries='10',
                 *args, **kwargs):

        self.SEARCH_URL = client_url
        super(AmazonSpider, self).__init__(*args, **kwargs)

        if file_name:
            self.file_name = file_name

        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter

        product_asins = json.loads(product_asins)
        self.product_asins = product_asins['asins']

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()
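
The product_asins argument must be a JSON object with an "asins" key; anything else makes the json.loads(product_asins)['asins'] lookup above raise. A minimal sketch with hypothetical ASIN values:

import json

payload = '{"asins": ["B000000001", "B000000002"]}'  # hypothetical ASINs
asins = json.loads(payload)['asins']
# asins == ['B000000001', 'B000000002']
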
Example No. 6
    def __init__(self, *args, **kwargs):
        # For some reason most data fails to scrape from Amazon
        # when variants are turned off
        self.ignore_variant_data = False
        self.product_url = kwargs['product_url']

        # See https://bugzilla.contentanalyticsinc.com/show_bug.cgi?id=3313#c0
        self.num_pages = int(kwargs.get('num_pages', 1))

        # # variants are switched off by default, see Bugzilla 3982#c11
        # self.scrape_variants_with_extra_requests = False
        # if 'scrape_variants_with_extra_requests' in kwargs:
        #     scrape_variants_with_extra_requests = \
        #         kwargs['scrape_variants_with_extra_requests']
        #     if scrape_variants_with_extra_requests in \
        #             (1, '1', 'true', 'True', True):
        #         self.scrape_variants_with_extra_requests = True

        # Default price currency
        self.price_currency = 'USD'
        self.price_currency_view = '$'

        # Locale
        self.locale = 'en-US'

        self.mtp_class = Amazon_marketplace(self)
        self._cbw = CaptchaBreakerWrapper()

        # #backup when total matches cannot be scraped
        # self.total_items_scraped = 0
        # # self.ranking_override = 0
        self.total_matches_re = r'of\s([\d\,]+)\s'
        super(AmazonShelfPagesSpider, self).__init__(*args, **kwargs)
        self._setup_class_compatibility()
        # self.remaining = self.quantity

        # For goldbox deals
        self.deal_response_json_list = []
        self.deal_product_url_list = []
        self.sorted_goldbox_deals_ids = []
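
Since __init__ reads kwargs['product_url'] directly, product_url is mandatory (a missing value raises KeyError), while num_pages is optional and defaults to 1. A construction sketch, assuming the base class requires no further arguments and using a hypothetical shelf URL:

spider = AmazonShelfPagesSpider(
    product_url='http://www.amazon.com/b?node=123456',  # hypothetical shelf URL
    num_pages='2')
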
Example No. 7
class AmazonSpider(Spider):
    name = 'amazon'
    allowed_domains = ["amazon.com"]
    start_urls = []
    handle_httpstatus_list = [404]
    MAX_RETRIES = 3

    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) Gecko'
                  '/20100101 Firefox/35.0')

    page = 1

    def __init__(self,
                 url_formatter=None,
                 client_url=None,
                 file_name=None,
                 product_asins=None,
                 captcha_retries='10',
                 *args, **kwargs):

        self.SEARCH_URL = client_url
        super(AmazonSpider, self).__init__(*args, **kwargs)

        if file_name:
            self.file_name = file_name

        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter

        product_asins = json.loads(product_asins)
        self.product_asins = product_asins['asins']

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()

    def make_requests_from_url(self, _):
        """This method does not apply to this type of spider so it is overriden
        and "disabled" by making it raise an exception unconditionally.
        """
        raise AssertionError("Need a search term.")

    def start_requests(self):
        """Generate Requests from the SEARCH_URL and the search terms."""
        meta = {'asins': self.product_asins}
        yield Request(self.SEARCH_URL,
                      meta=meta)

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = self.parse_without_captcha(response)
        return result

    def parse_without_captcha(self, response):
        item = AmazonspiderItem()
        if response.status == 404:
            item['error_message'] = '404 Invalid URL'
            return item
        item['client_url'] = response.url
        meta = response.meta.copy()
        meta['item'] = item
        reviews_url = response.xpath(
            '//a[@class="a-link-normal"]/span[contains(text(),'
            ' "Reviews")]/../@href[contains(.,"member-reviews")]'
        ).extract()

        if reviews_url:
            reviews_url = 'http://www.amazon.com' + reviews_url[0]

            return Request(reviews_url, meta=meta,
                           callback=self.parse_reviews)
        else:
            item['error_message'] = 'Amazon blocked, try again'
            return item

    def parse_reviews(self, response):
        self.log('PARSE REVIEWS', level=DEBUG)
        products_asins = response.meta.get('asins')
        item = response.meta.get('item')

        review_asins = response.xpath(
            '//table[@class="small"]/tr/td/b/a/@href').re('dp/(.*)/ref')
        find_asins = []
        for asin in review_asins:
            if asin in products_asins:
                find_asins.append(asin)

        if 'asins' not in item:
            item['asins'] = find_asins
        else:
            item['asins'].extend(find_asins)

        self.page += 1
        next_page_url = response.xpath(
            '//td[@class="small"]/b/a[contains(@href,"page=' + str(self.page)
            + '")]/@href').extract()
        if next_page_url:
            next_page_url = 'http://www.amazon.com' + next_page_url[0]

            meta = response.meta.copy()
            meta['item'] = item
            yield Request(next_page_url, meta=meta,
                          callback=self.parse_reviews)
        else:
            yield item

    # Captcha handling functions.
    def _has_captcha(self, response):
        return '.images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "Expected exactly one form, found %d." % len(forms)

        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]

        self.log("Extracted capcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        self.log("Captcha challenge for %s (try %d)."
                 % (url, captcha_solve_try),
                 level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log(
                "Failed to guess captcha for '%s' (try: %d)." % (
                    url, captcha_solve_try),
                level=ERROR
            )
            result = None
        else:
            self.log(
                "On try %d, submitting captcha '%s' for '%s'." % (
                    captcha_solve_try, captcha, url),
                level=INFO
            )
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback,
                dont_filter=True,
                meta=meta)

        return result
Example No. 8
    def __init__(self, captcha_retries='10', *args, **kwargs):
        super(AmazonProductsSpider, self).__init__(*args, **kwargs)

        self.captcha_retries = int(captcha_retries)

        self._cbw = CaptchaBreakerWrapper()
Example No. 9
class AmazonProductsSpider(BaseProductsSpider):
    name = 'amazon_products'
    allowed_domains = ["amazon.com"]

    SEARCH_URL = "http://www.amazon.com/s/?field-keywords={search_term}"

    def __init__(self, captcha_retries='10', *args, **kwargs):
        super(AmazonProductsSpider, self).__init__(*args, **kwargs)

        self.captcha_retries = int(captcha_retries)

        self._cbw = CaptchaBreakerWrapper()

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = super(AmazonProductsSpider, self).parse(response)
        return result

    def parse_product(self, response):
        prod = response.meta['product']

        if not self._has_captcha(response):
            self._populate_from_js(response, prod)

            self._populate_from_html(response, prod)

            cond_set_value(prod, 'locale', 'en-US')  # Default locale.

            result = prod
        elif response.meta.get('captcha_solve_try', 0) >= self.captcha_retries:
            self.log("Giving up on trying to solve the captcha challenge after"
                     " %s tries for: %s" % (self.captcha_retries, prod['url']),
                     level=WARNING)
            result = None
        else:
            result = self._handle_captcha(response, self.parse_product)
        return result

    def _populate_from_html(self, response, product):
        cond_set(product, 'brand', response.css('#brand ::text').extract())
        cond_set(
            product,
            'price',
            response.css('#priceblock_ourprice ::text').extract(),
        )
        cond_set(
            product,
            'description',
            response.css('.productDescriptionWrapper').extract(),
        )
        cond_set(
            product,
            'image_url',
            response.css(
                '#imgTagWrapperId > img ::attr(data-old-hires)').extract()
        )
        cond_set(
            product, 'title', response.css('#productTitle ::text').extract())

        # Some data is in a list (ul element).
        model = None
        for li in response.css('td.bucket > .content > ul > li'):
            raw_keys = li.xpath('b/text()').extract()
            if not raw_keys:
                # This is something else, ignore.
                continue

            key = raw_keys[0].strip(' :').upper()
            if key == 'UPC':
                # Some products have several UPCs. The first one is used.
                raw_upc = li.xpath('text()').extract()[0]
                cond_set(
                    product,
                    'upc',
                    raw_upc.strip().split(' '),
                    conv=int
                )
            elif (key == 'ASIN' and model is None) or key == 'ITEM MODEL NUMBER':
                model = li.xpath('text()').extract()
        cond_set(product, 'model', model, conv=string.strip)

    def _populate_from_js(self, response, product):
        # Images are not always on the same spot...
        img_jsons = response.css(
            '#landingImage ::attr(data-a-dynamic-image)').extract()
        if img_jsons:
            img_data = json.loads(img_jsons[0])
            cond_set_value(
                product,
                'image_url',
                max(img_data.items(), key=lambda (_, size): size[0]),
                conv=lambda (url, _): url)

    def _scrape_total_matches(self, response):
        # Where this value appears is a little weird and changes a bit so we
        # need two alternatives to capture it consistently.

        if response.css('#noResultsTitle'):
            return 0

        # The first possible place is where it normally is in a fully rendered
        # page.
        values = response.css('#resultCount > span ::text').re(
            '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults')
        if not values:
            # Otherwise, it appears within a comment.
            values = response.css(
                '#result-count-only-next'
            ).xpath(
                'comment()'
            ).re(
                '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults\s+'
            )

        if values:
            total_matches = int(values[0].replace(',', ''))
        else:
            self.log(
                "Failed to parse total number of matches for: %s"
                % response.url,
                level=ERROR
            )
            total_matches = None
        return total_matches

    def _scrape_product_links(self, response):
        links = response.css('.prod > h3 > a ::attr(href)').extract()
        if not links:
            self.log("Found no product links.", WARNING)
        for link in links:
            yield link, SiteProductItem()

    def _scrape_next_results_page_link(self, response):
        next_pages = response.css('#pagnNextLink ::attr(href)').extract()
        next_page_url = None
        if len(next_pages) == 1:
            next_page_url = next_pages[0]
        elif len(next_pages) > 1:
            self.log("Found more than one 'next page' link.", ERROR)
        return next_page_url

    ## Captcha handling functions.

    def _has_captcha(self, response):
        return '.images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "Expected exactly one form, found %d." % len(forms)

        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]

        self.log("Extracted capcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        # FIXME This is untested and wrong.
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        product = response.meta['product']

        self.log("Captcha challenge for %s (try %d)."
                 % (product['url'], captcha_solve_try),
                 level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log(
                "Failed to guess captcha for '%s' (try: %d)." % (
                    product['url'], captcha_solve_try),
                level=ERROR
            )
            result = None
        else:
            self.log(
                "On try %d, submitting captcha '%s' for '%s'." % (
                    captcha_solve_try, captcha, product['url']),
                level=INFO
            )
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback)
            result.meta['captcha_solve_try'] = captcha_solve_try + 1
            result.meta['product'] = product

        return result
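
The data-a-dynamic-image attribute parsed by _populate_from_js is a JSON object mapping image URLs to [width, height] pairs, so taking max by size[0] selects the widest variant. A sketch with hypothetical values:

import json

# hypothetical attribute value: image URL -> [width, height]
attr = ('{"http://img.example/small.jpg": [300, 400],'
        ' "http://img.example/large.jpg": [500, 667]}')
img_data = json.loads(attr)
url, _ = max(img_data.items(), key=lambda (_, size): size[0])  # Python 2 tuple parameters
# url == 'http://img.example/large.jpg'
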
Example No. 10
    def __init__(self,
                 url_formatter=None,
                 quantity=None,
                 page=None,
                 searchterms_str='laptop',
                 searchterms_fn=None,
                 site_name=None,
                 product_url=None,
                 user_agent=None,
                 captcha_retries='10',
                 *args,
                 **kwargs):
        if user_agent is None or user_agent not in self.USER_AGENTS:
            self.log(
                "Unknown or unset user agent type;"
                " the default user agent will be used.", INFO)
            user_agent = 'default'

        if user_agent:
            self.user_agent = self.USER_AGENTS[user_agent]
            self.user_agent_key = user_agent

        super(AmazonSpider, self).__init__(*args, **kwargs)

        if site_name is None:
            assert len(self.allowed_domains) == 1, \
                "A single allowed domain is required to auto-detect site name."
            self.site_name = self.allowed_domains[0]
        else:
            self.site_name = site_name

        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter

        if quantity is None:
            self.log("No quantity specified. Will retrieve all products.",
                     INFO)
            import sys
            self.quantity = sys.maxint
        else:
            self.quantity = int(quantity)

        if page is None:
            self.log("No page specified. Will retrieve all products.", INFO)
            import sys
            self.page = sys.maxint
        else:
            self.page = int(page)

        self.product_url = product_url

        self.searchterms = []
        if searchterms_str is not None:
            self.searchterms = searchterms_str.decode('utf-8').split(',')
        elif searchterms_fn is not None:
            import io  # Python 2's builtin open() lacks an encoding parameter
            with io.open(searchterms_fn, encoding='utf-8') as f:
                self.searchterms = f.readlines()
        else:
            self.log("No search terms provided!", ERROR)

        self.log(
            "Created for %s with %d search terms." %
            (self.site_name, len(self.searchterms)), INFO)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()
Example No. 11
class UrlServiceSpider(Spider):

    name = "url_service"
    allowed_domains = []
    start_urls = []

    def __init__(self,
                 limit='100',
                 service_url=None,
                 captcha_retries='10',
                 *args,
                 **kwargs):
        super(UrlServiceSpider, self).__init__(*args, **kwargs)

        if service_url is None:
            raise AssertionError("Service URL is not optional.")

        self.limit = limit
        self.captcha_retries = int(captcha_retries)
        self.service_url = service_url

        self._cbw = CaptchaBreakerWrapper()

        queue_url = urlparse.urljoin(
            self.service_url, 'get_queued_urls/?limit=%d&block=%d') \
            % (int(limit), 0)
        self.log("Fetching URLs with '%s'." % queue_url, level=DEBUG)
        self.start_urls.append(queue_url)

    def parse(self, response):
        for crawl_data in json.loads(response.body):
            self.log("From URL Service: %s" % crawl_data, DEBUG)
            url = crawl_data['url']

            req = Request(url,
                          callback=self.parse_target,
                          errback=self.parse_target_err)
            req.meta['crawl_data'] = crawl_data
            req.meta['start_time'] = time.clock()
            yield req

    def parse_target(self, response):
        if not self._has_captcha(response.body):
            result = self._parse_target(response)
        elif response.meta.get('captcha_solve_try', 0) >= self.captcha_retries:
            # We already tried to solve the captcha, give up.

            result = RequestErrorItem(base_url=self.service_url,
                                      id=response.meta['crawl_data']['id'],
                                      http_code=response.status,
                                      error_string="Failed to solve captcha.")
        else:
            result = self._handle_captcha(response)
        return result

    def _parse_target(self, response):
        crawl_data = response.meta['crawl_data']

        body = None
        if hasattr(response, 'body_as_unicode'):
            body = response.body_as_unicode().encode('utf-8')
        else:
            body = response.body  # Probably binary or incorrect Content-Type.

        item = PageItem(base_url=self.service_url,
                        total_time=time.clock() - response.meta['start_time'],
                        id=crawl_data['id'],
                        url=crawl_data['url'],
                        imported_data_id=crawl_data['imported_data_id'],
                        category_id=crawl_data['category_id'],
                        body=body)
        return item

    def _handle_captcha(self, response):
        crawl_data = response.meta['crawl_data']
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)

        self.log("Captcha challenge for %s (try %d)." %
                 (crawl_data.get('url'), captcha_solve_try),
                 level=INFO)

        forms = Selector(response).xpath('//form')
        assert len(forms) == 1, "Expected exactly one form, found %d." % len(forms)
        hidden_value1 = forms[0].xpath(
            '//input[@name="amzn"]/@value').extract()[0]
        hidden_value2 = forms[0].xpath(
            '//input[@name="amzn-r"]/@value').extract()[0]
        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]

        self.log("Extracted capcha values: (%s) (%s) (%s)" %
                 (hidden_value1, hidden_value2, captcha_img),
                 level=DEBUG)
        captcha = self._solve_captcha(captcha_img)

        if captcha is None:
            err_msg = "Failed to guess captcha for '%s' (id: %s, try: %d)." % (
                crawl_data.get('url'), crawl_data.get('id'), captcha_solve_try)
            self.log(err_msg, level=ERROR)
            result = RequestErrorItem(base_url=self.service_url,
                                      id=crawl_data['id'],
                                      http_code=response.status,
                                      error_string=err_msg)
        else:
            self.log("Submitting captcha '%s' for '%s' (try %d)." %
                     (captcha, captcha_img, captcha_solve_try),
                     level=INFO)
            result = FormRequest.from_response(response,
                                               formname='',
                                               formdata={
                                                   'field-keywords': captcha,
                                               },
                                               callback=self.parse_target,
                                               errback=self.parse_target_err)
            result.meta['captcha_solve_try'] = captcha_solve_try + 1
            result.meta['crawl_data'] = response.meta['crawl_data']
            result.meta['start_time'] = response.meta['start_time']

        return result

    def parse_target_err(self, failure):
        url_id = failure.request.meta['crawl_data']['id']
        error_string = failure.getErrorMessage()
        if isinstance(failure.value, HttpError):
            status = failure.value.response.status
        else:
            status = 0
            self.log("Unhandled failure type '%s'. Will continue" %
                     type(failure.value),
                     level=ERROR)

        item = RequestErrorItem(base_url=self.service_url,
                                id=url_id,
                                http_code=status,
                                error_string=error_string)
        return item

    def _has_captcha(self, body):
        return '.images-amazon.com/captcha/' in body

    def _solve_captcha(self, captcha_url):
        return self._cbw.solve_captcha(captcha_url)
Example No. 12
    def __init__(self, captcha_retries='10', *args, **kwargs):

        super(AmazonBaseClass, self).__init__(*args, **kwargs)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()
Example No. 13
class AmazonBaseClass(Spider):
    def __init__(self, captcha_retries='10', *args, **kwargs):

        super(AmazonBaseClass, self).__init__(*args, **kwargs)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = self.parse_without_captcha(response)
        return result

    def parse_links(self, response):
        """
        Handles parsing of a top reviewers page.
        :param response:
        :return: ReviewItem objects with Rank
        """
        raise NotImplementedError

    def parse_profile(self, response):
        """
        Handles parsing of a reviewer profile page.
        :param response:
        :return: ReviewItem objects with Email, Name and Country
        """
        raise NotImplementedError

    def parse_without_captcha(self, response):
        if not self._has_captcha(response):
            res = self.parse_links(response)
            for i in res:
                yield i
        else:
            result = self._handle_captcha(response, self.parse_without_captcha)
            yield result

    def parse_email(self, response):
        if not self._has_captcha(response):
            result = self.parse_profile(response)
            if result:
                return result
        else:
            result = self._handle_captcha(response, self.parse_email)
            return result

    # Captcha handling functions.
    def _has_captcha(self, response):
        return 'images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "Expected exactly one form, found %d." % len(forms)

        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]

        self.log("Extracted capcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        self.log("Captcha challenge for %s (try %d)." %
                 (url, captcha_solve_try),
                 level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log("Failed to guess captcha for '%s' (try: %d)." %
                     (url, captcha_solve_try),
                     level=ERROR)
            result = None
        else:
            self.log("On try %d, submitting captcha '%s' for '%s'." %
                     (captcha_solve_try, captcha, url),
                     level=INFO)
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback,
                dont_filter=True,
                meta=meta)

        return result
Example No. 14
    def __init__(self,
                 url_formatter=None,
                 quantity=None,
                 page=None,
                 searchterms_str=None, searchterms_fn=None,
                 site_name=None,
                 product_url=None,
                 user_agent=None,
                 captcha_retries='10',
                 *args, **kwargs):
        if user_agent is None or user_agent not in self.USER_AGENTS:
            self.log("Unknown or unset user agent type;"
                     " the default user agent will be used.", INFO)
            user_agent = 'default'

        if user_agent:
            self.user_agent = self.USER_AGENTS[user_agent]
            self.user_agent_key = user_agent

        super(AmazonSpider, self).__init__(*args, **kwargs)

        if site_name is None:
            assert len(self.allowed_domains) == 1, \
                "A single allowed domain is required to auto-detect site name."
            self.site_name = self.allowed_domains[0]
        else:
            self.site_name = site_name

        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter

        if quantity is None:
            self.log("No quantity specified. Will retrieve all products.",
                     INFO)
            import sys
            self.quantity = sys.maxint
        else:
            self.quantity = int(quantity)

        if page is None:
            self.log("No page specified. Will retrieve all products.",
                     INFO)
            import sys
            self.page = sys.maxint
        else:
            self.page = int(page)

        self.product_url = product_url

        self.searchterms = []
        if searchterms_str is not None:
            self.searchterms = searchterms_str.decode('utf-8').split(',')
        elif searchterms_fn is not None:
            import io  # Python 2's builtin open() lacks an encoding parameter
            with io.open(searchterms_fn, encoding='utf-8') as f:
                self.searchterms = f.readlines()
        else:
            self.log("No search terms provided!", ERROR)

        self.log("Created for %s with %d search terms."
                 % (self.site_name, len(self.searchterms)), INFO)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()
Example No. 15
class AmazonSpider(Spider):
    name = 'amazon'
    allowed_domains = ["amazon.com"]
    start_urls = []

    SEARCH_URL = 'http://www.amazon.com/s/ref=sr_as_oo?' \
                 'rh=i%3Aaps%2Ck%3A{search_term}&keywords={search_term}'

    MAX_RETRIES = 3

    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) Gecko'
                  '/20100101 Firefox/35.0')

    USER_AGENTS = {
        'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\
            'Gecko/20100101 Firefox/35.0',
        'desktop': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\
            'Gecko/20100101 Firefox/35.0',
        'iphone_ipad': 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_6 '\
            'like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) '\
            'Version/7.0 Mobile/11B651 Safari/9537.53',
        'android_phone': 'Mozilla/5.0 (Android; Mobile; rv:35.0) '\
            'Gecko/35.0 Firefox/35.0',
        'android_pad': 'Mozilla/5.0 (Android; Tablet; rv:35.0) '\
            'Gecko/35.0 Firefox/35.0',
        'android': 'Mozilla/5.0 (Android; Tablet; rv:35.0) '\
            'Gecko/35.0 Firefox/35.0',
    }

    def __init__(self,
                 url_formatter=None,
                 quantity=None,
                 page=None,
                 searchterms_str=None, searchterms_fn=None,
                 site_name=None,
                 product_url=None,
                 user_agent=None,
                 captcha_retries='10',
                 *args, **kwargs):
        if user_agent is None or user_agent not in self.USER_AGENTS:
            self.log("Unknown or unset user agent type;"
                     " the default user agent will be used.", INFO)
            user_agent = 'default'

        if user_agent:
            self.user_agent = self.USER_AGENTS[user_agent]
            self.user_agent_key = user_agent

        super(AmazonSpider, self).__init__(*args, **kwargs)

        if site_name is None:
            assert len(self.allowed_domains) == 1, \
                "A single allowed domain is required to auto-detect site name."
            self.site_name = self.allowed_domains[0]
        else:
            self.site_name = site_name

        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter

        if quantity is None:
            self.log("No quantity specified. Will retrieve all products.",
                     INFO)
            import sys
            self.quantity = sys.maxint
        else:
            self.quantity = int(quantity)

        if page is None:
            self.log("No page specified. Will retrieve all products.",
                     INFO)
            import sys
            self.page = sys.maxint
        else:
            self.page = int(page)

        self.product_url = product_url

        self.searchterms = []
        if searchterms_str is not None:
            self.searchterms = searchterms_str.decode('utf-8').split(',')
        elif searchterms_fn is not None:
            import io  # Python 2's builtin open() lacks an encoding parameter
            with io.open(searchterms_fn, encoding='utf-8') as f:
                self.searchterms = f.readlines()
        else:
            self.log("No search terms provided!", ERROR)

        self.log("Created for %s with %d search terms."
                 % (self.site_name, len(self.searchterms)), INFO)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()

    def make_requests_from_url(self, _):
        """This method does not apply to this type of spider so it is overriden
        and "disabled" by making it raise an exception unconditionally.
        """
        raise AssertionError("Need a search term.")

    def start_requests(self):
        """Generate Requests from the SEARCH_URL and the search terms."""
        for st in self.searchterms:
            yield Request(
                self.url_formatter.format(
                    self.SEARCH_URL,
                    search_term=urllib.quote_plus(st.encode('utf-8')),
                ),
                meta={'search_term': st, 'remaining': self.quantity},
            )

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = self.parse_without_captcha(response)
        return result

    def parse_without_captcha(self, response):
        if self._search_page_error(response):
            remaining = response.meta['remaining']
            search_term = response.meta['search_term']

            self.log("For search term '%s' with %d items remaining,"
                     " failed to retrieve search page: %s"
                     % (search_term, remaining, response.request.url),
                     WARNING)
        else:
            prods_count = -1  # Also used after the loop.
            for prods_count, request_or_prod in enumerate(
                    self._get_products(response)):
                yield request_or_prod
            prods_count += 1  # Fix counter.

            request = self._get_next_products_page(response, prods_count)
            if request is not None:
                yield request

    def _get_products(self, response):
        remaining = response.meta['remaining']
        search_term = response.meta['search_term']
        total_matches = response.meta.get('total_matches')

        prods = self._scrape_product_links(response)

        if total_matches is None:
            total_matches = self._scrape_total_matches(response)
            if total_matches is not None:
                response.meta['total_matches'] = total_matches
                self.log("Found %d total matches." % total_matches, INFO)
            else:
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to parse total matches for %s" %
                            response.url, ERROR)

        for i, prod_item in enumerate(islice(prods, 0, remaining)):
            prod_item['keyword'] = search_term
            prod_item['total_matches'] = total_matches
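            # rank continues across pages: e.g. with quantity=100 and
            # remaining=60 (40 products already yielded), the first item
            # here (i == 0) gets rank (0 + 1) + (100 - 60) == 41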
            prod_item['rank'] = (i + 1) + (self.quantity - remaining)
            yield prod_item


    def _get_next_products_page(self, response, prods_found):
        page_number = int(response.meta.get('page_number', 1))
        link_page_attempt = response.meta.get('link_page_attempt', 1)

        result = None
        if prods_found is not None:
            # This was a real product listing page.
            if page_number < self.page:
                remaining = response.meta['remaining']
                remaining -= prods_found
                next_page = self._scrape_next_results_page_link(response)
                if next_page is not None:
                    url = urlparse.urljoin(response.url, next_page)
                    new_meta = dict(response.meta)
                    new_meta['remaining'] = remaining
                    new_meta['page_number'] = page_number + 1
                    result = Request(url, self.parse,
                                     meta=new_meta, priority=1)
        elif link_page_attempt > self.MAX_RETRIES:
            self.log(
                "Giving up on results page after %d attempts: %s" % (
                    link_page_attempt, response.request.url),
                ERROR
            )
        else:
            self.log(
                "Will retry to get results page (attempt %d): %s" % (
                    link_page_attempt, response.request.url),
                WARNING
            )

            # Found no product links. Probably a transient error, let's retry.
            new_meta = response.meta.copy()
            new_meta['link_page_attempt'] = link_page_attempt + 1
            result = response.request.replace(
                meta=new_meta, cookies={}, dont_filter=True)

        return result

    def _scrape_total_matches(self, response):
        if response.css('#noResultsTitle'):
            return 0

        values = response.css('#s-result-count ::text').re(
            '([0-9,]+)\s[Rr]esults for')
        if not values:
            values = response.css('#resultCount > span ::text').re(
                '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults')
            if not values:
                values = response.css(
                    '#result-count-only-next'
                ).xpath(
                    'comment()'
                ).re(
                    '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults\s+'
                )

        if values:
            total_matches = int(values[0].replace(',', ''))
        else:
            if not self.is_nothing_found(response):
                self.log(
                    "Failed to parse total number of matches for: %s"
                    % response.url,
                    level=ERROR
                )
            total_matches = None
        return total_matches

    def _scrape_product_links(self, response):
        products = response.xpath('//li[@class="s-result-item"]')

        for pr in products:
            if pr.xpath('.//h5[contains(@class, "s-sponsored-list-header")]'):
                continue
            product = ProductItem()

            cond_set(product, 'title',
                     pr.xpath('.//h2/../@title').extract())

            cond_set(product, 'product_image',
                     pr.xpath('.//img[@alt="Product Details"]/@src').extract())

            cond_set(product, 'brand',
                     pr.xpath(
                         './/div[@class="a-fixed-left-grid-col a-col-right"]'
                         '/div/div/span[2]/text()').extract())

            cond_set(product, 'price',
                     pr.xpath(
                        './/span[contains(@class,"s-price")]/text()'
                     ).extract())

            cond_set(product, 'asin', pr.xpath('@data-asin').extract())

            if pr.xpath('.//i[contains(@class, "a-icon-prime")]'):
                cond_set_value(product, 'prime', True)
            else:
                cond_set_value(product, 'prime', False)

            cond_set(product, 'shipping_price', pr.xpath(
                './/span[contains(@class,"s-price")]/'
                'following::span[2]/text()').re('(\d+.?\d+) shipping'))

            new = pr.xpath('.//a[contains(text(),"new")]/span/text()')

            if new:
                cond_set(product, 'new_price', new.extract())
                cond_set(product, 'new_offers', new[1].re('\d+'))

            used = pr.xpath('.//a[contains(text(),"used")]/span/text()')

            if used:
                cond_set(product, 'used_price', used.extract())
                cond_set(product, 'used_offers', used[1].re('\d+'))

            cond_set(product, 'rating', pr.xpath(
                './/span[contains(@name,"'+product['asin']+'")]/span/a/i/span'
            ).re('(\d+.?\d+)'))

            cond_set(product, 'number_of_reviews', pr.xpath(
                './/span[contains(@name,"'+product['asin']+'")]/'
                'following::a[1]/text()').re('([\d+,?]+\d+)'))

            cond_set(product, 'category', pr.xpath(
                './/span[contains(@class,"a-text-bold")]/text()'
            ).re('(.*):'))

            number_of_items = pr.xpath(
                './/span[contains(@class,"a-text-bold")]/../text()'
            ).re('([\d+,?]+\d+)')

            if number_of_items:
                cond_set_value(product, 'number_of_items', number_of_items[0])

            # product['url'] = pr.xpath('.//h2/../@href')[0].extract()
            # cond_set(product, 'url', pr.xpath('.//h2/../@href').extract())
            yield product

    def _scrape_next_results_page_link(self, response):
        next_pages = response.css('#pagnNextLink ::attr(href)').extract()
        next_page_url = None
        if len(next_pages) == 1:
            next_page_url = next_pages[0]
        elif len(next_pages) > 1:
            self.log("Found more than one 'next page' link.", ERROR)
        return next_page_url

    def is_nothing_found(self, response):
        txt = response.xpath('//h1[@id="noResultsTitle"]/text()').extract()
        txt = ''.join(txt)
        return 'did not match any products' in txt

    def _search_page_error(self, response):
        body = response.body_as_unicode()
        return "Your search" in body \
            and  "did not match any products." in body

    # Captcha handling functions.
    def _has_captcha(self, response):
        return '.images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "Expected exactly one form, found %d." % len(forms)

        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]

        self.log("Extracted capcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        self.log("Captcha challenge for %s (try %d)."
                 % (url, captcha_solve_try),
                 level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log(
                "Failed to guess captcha for '%s' (try: %d)." % (
                    url, captcha_solve_try),
                level=ERROR
            )
            result = None
        else:
            self.log(
                "On try %d, submitting captcha '%s' for '%s'." % (
                    captcha_solve_try, captcha, url),
                level=INFO
            )
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback,
                dont_filter=True,
                meta=meta)

        return result
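
The result-count regexes used by _scrape_total_matches above can be exercised in isolation; a sketch with a hypothetical snippet of the rendered count text:

import re

snippet = '1-16 of 2,437 Results'  # hypothetical result-count text
m = re.search(r'\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults', snippet)
total = int(m.group(1).replace(',', ''))
# total == 2437
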
Example No. 16
class AmazonBaseClass(Spider):
    def __init__(self,
                 captcha_retries='10',
                 *args, **kwargs):

        super(AmazonBaseClass, self).__init__(*args, **kwargs)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = self.parse_without_captcha(response)
        return result

    def parse_links(self, response):
        """
        Handles parsing of a top reviewers page.
        :param response:
        :return: ReviewItem objects with Rank
        """
        raise NotImplementedError

    def parse_profile(self, response):
        """
        Handles parsing of a reviewer profile page.
        :param response:
        :return: ReviewItem objects with Email, Name and Country
        """
        raise NotImplementedError

    def parse_without_captcha(self, response):
        if not self._has_captcha(response):
            res = self.parse_links(response)
            for i in res:
                yield i
        else:
            result = self._handle_captcha(response, self.parse_without_captcha)
            yield result

    def parse_email(self, response):
        if not self._has_captcha(response):
            result = self.parse_profile(response)
            if result:
                return result
        else:
            result = self._handle_captcha(response, self.parse_email)
            return result

    # Captcha handling functions.
    def _has_captcha(self, response):
        return 'images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "Expected exactly one form, found %d." % len(forms)

        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]

        self.log("Extracted capcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        self.log("Captcha challenge for %s (try %d)."
                 % (url, captcha_solve_try),
                 level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log(
                "Failed to guess captcha for '%s' (try: %d)." % (
                    url, captcha_solve_try),
                level=ERROR
            )
            result = None
        else:
            self.log(
                "On try %d, submitting captcha '%s' for '%s'." % (
                    captcha_solve_try, captcha, url),
                level=INFO
            )
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback,
                dont_filter=True,
                meta=meta)

        return result
Example No. 17
class AmazonSpider(BaseSpider):
    name = "amazon"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "http://www.amazon.com/gp/site-directory/ref=sa_menu_top_fullstore"
    ]

    def __init__(self, outfile=None, test_category=None):
        self.outfile = outfile

        # if set, crawl only this category (a level 1/2 category name); used for testing
        self.test_category = test_category

        # if test category is set and no output file was specified, set the name of outfile to a special "test" name
        if self.test_category and not self.outfile:
            self.outfile = "amazon_categories_test.jl"

        # level that is considered to contain departments
        self.DEPARTMENT_LEVEL = 2

        # hardcoded top-level category (level 1 and 2) URLs to replace/supplement some of those found on the sitemap above
        # (they point to the same categories but have different page content; they were found manually)
        # reason: they provide more info on product counts than the pages found on the sitemap
        # keys are category names as they appear in the sitemap; values are the URLs that will replace/supplement the sitemap links
        self.EXTRA_TOPLEVEL_CATEGORIES_URLS = {
            "Baby": "http://www.amazon.com/s/ref=lp_166835011_ex_n_1?rh=n%3A165796011&bbn=165796011&ie=UTF8&qid=1393338541",
            "Electronics & Computers": "http://www.amazon.com/s/ref=lp_172659_ex_n_1?rh=n%3A172282&bbn=172282&ie=UTF8&qid=1393338741",
            "Home, Garden & Tools": "http://www.amazon.com/s/ref=lp_284507_ex_n_1?rh=n%3A1055398&bbn=1055398&ie=UTF8&qid=1393338782",
            "Kindle E-readers & Books": "http://www.amazon.com/s/ref=lp_154606011_ex_n_1?rh=n%3A133140011&bbn=133140011&ie=UTF8&qid=1395704970",
            "Apps & Games": "http://www.amazon.com/b/ref=sd_allcat_fire_apps_games?ie=UTF8&node=3427287011",
            "Movies & TV": "http://www.amazon.com/action-adventure-dvd-bluray/b/ref=MoviesHPBB_Genres_Action?ie=UTF8&node=2650363011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-left-2&pf_rd_r=0GAWFEZ3EXP8PEYCM6X3&pf_rd_t=101&pf_rd_p=1753817742&pf_rd_i=2625373011",
            "All Beauty": "http://www.amazon.com/s/ref=lp_11059031_ex_n_1?rh=n%3A3760911&bbn=3760911&ie=UTF8&qid=1395793680",
            "Health, Household & Baby Care": "http://www.amazon.com/s/ref=lp_6183682011_ex_n_1?rh=n%3A3760901&bbn=3760901&ie=UTF8&qid=1395822180",
            "Tires & Wheels": "http://www.amazon.com/s/ref=lp_353609011_ex_n_1?rh=n%3A15684181%2Cn%3A%2115690151%2Cn%3A15706571&bbn=15706571&ie=UTF8&qid=1395824546",
            "Motorcycle & Powersports": "http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A15684181%2Cn%3A%2115690151%2Cn%3A346333011&bbn=346333011&ie=UTF8&qid=1395824599",
            # this entry is partial - "Automotive & Industrial" also contains the "Industrial & Scientific" categories, which can be found in the sitemap
            "Automotive & Industrial": "http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A15684181&bbn=15684181",
        }

        # flag indicating whether to compute overall product counts in pipelines phase for this spider.
        # if on, 'catid' and 'parent_catid' fields need to be implemented
        self.compute_nrproducts = True

        # counter used to auto-increment department ids
        self.department_count = 0
        # counter for category id
        self.catid = 0

        # level to stop crawling (don't extract subcategories below this level)
        self.LEVEL_BARRIER = -2

        # maximum number of retries when presented with captcha form
        self.MAX_CAPTCHA_RETRY = 10

        # dictionaries associating department names with other attributes - used for setting parent category info for level 1 categories
        # associates department names with their ids
        self.departments_ids = {}
        # associates department names with their urls (will be available only for extra_categories)
        self.department_urls = {}
        # associate department names with their category ids
        self.departments_cat_ids = {}

        # captcha breaker
        self.CB = CaptchaBreakerWrapper()

    # solve the captcha on this page and redirect back to the method that sent us here (callback)
    def solve_captcha_and_redirect(self, response, callback):
        hxs = HtmlXPathSelector(response)

        # solve captcha
        captcha_text = None
        image = hxs.select(".//img/@src").extract()
        if image:
            captcha_text = self.CB.solve_captcha(image[0])

        # value to use if there was an exception
        if not captcha_text:
            captcha_text = ''

        # create a FormRequest to this same URL, with everything needed in meta
        # items, cookies and search_urls not changed from previous response so no need to set them again

        # redirect to initial URL
        meta = response.meta
        # decrement retries left; if not set yet this is the first attempt, so initialize to MAX_CAPTCHA_RETRY
        response.meta['retry_count'] = response.meta[
            'retry_count'] - 1 if 'retry_count' in response.meta else self.MAX_CAPTCHA_RETRY
        return FormRequest.from_response(
            response,
            callback=callback,
            formdata={'field-keywords': captcha_text},
            meta=meta)

    # test if page is form containing captcha
    def has_captcha(self, body):
        return '.images-amazon.com/captcha/' in body
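
    # Illustrative detection (not part of the original example; `spider` is a
    # hypothetical instance of this class):
    #   >>> spider.has_captcha('<img src="http://ecx.images-amazon.com/captcha/a.jpg">')
    #   True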

    # check if two category names are the same
    # does some normalization of the names and compares the words in them
    # to be used for identifying EXTRA_TOPLEVEL_CATEGORIES_URLS when they occur in the sitemap
    def is_same_name(self, name1, name2):
        # eliminate non-word characters
        name1 = re.sub("[^a-zA-Z]", " ", name1).lower()
        name2 = re.sub("[^a-zA-Z]", " ", name2).lower()

        name1_words = name1.split()
        name2_words = name2.split()

        return set(name1_words) == set(name2_words)

    # find key in dict using is_same_name as equality function (return key from dict where is_same_name returns true for given target_key)
    def find_matching_key(self, target_key, dictionary):
        for key in dictionary:
            if self.is_same_name(target_key, key):
                return key

        return None
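
    # Illustrative normalization (not part of the original example):
    #   >>> self.is_same_name("Home, Garden & Tools", "home garden tools")
    #   True
    #   >>> self.find_matching_key("Home, Garden & Tools",
    #   ...                        self.EXTRA_TOPLEVEL_CATEGORIES_URLS)
    #   'Home, Garden & Tools'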

    # start parsing of top level categories extracted from sitemap; pass them to parseCategory
    def parse(self, response):

        hxs = HtmlXPathSelector(response)

        # if there is a captcha to solve, and we haven't exhausted our retries, try to solve it
        if self.has_captcha(
                response.body) and ('retry_count' not in response.meta
                                    or response.meta['retry_count'] > 0):
            yield self.solve_captcha_and_redirect(
                response, self.parse
            )  # meta of response will contain number of retries left if set
            return

        links_level1 = hxs.select("//div[@id='siteDirectory']//table//a")
        titles_level1 = hxs.select("//div//table//h2")

        # add level 1 categories to items

        # first one is a special category ("Unlimited Instant Videos"), add it separately
        special_item = CategoryItem()
        special_item['text'] = titles_level1[0].select('text()').extract()[0]
        special_item['level'] = 2
        special_item['special'] = 1
        special_item['department_text'] = special_item['text']
        special_item['department_id'] = self.department_count
        self.department_count += 1

        special_item['catid'] = self.catid
        self.catid += 1

        self.departments_ids[
            special_item['text']] = special_item['department_id']
        self.departments_cat_ids[special_item['text']] = special_item['catid']

        # if test category is set, and this is not it, ignore
        if not self.test_category or special_item['text'] == self.test_category:
            yield special_item

        # the rest of the titles are not special
        for title in titles_level1[1:]:
            item = CategoryItem()
            item['text'] = title.select('text()').extract()[0]
            item['level'] = 2
            item['department_text'] = item['text']
            item['department_id'] = self.department_count
            self.department_count += 1

            item['catid'] = self.catid
            self.catid += 1

            self.departments_ids[item['text']] = item['department_id']
            self.departments_cat_ids[item['text']] = item['catid']

            # if item is found among EXTRA_TOPLEVEL_CATEGORIES_URLS, add info from that url
            extra_category = self.find_matching_key(
                item['text'], self.EXTRA_TOPLEVEL_CATEGORIES_URLS)
            if extra_category:
                item['url'] = self.EXTRA_TOPLEVEL_CATEGORIES_URLS[
                    extra_category]
                item['department_url'] = item['url']
                self.department_urls[item['text']] = item['url']

                # if self.test_category is set, only send request if this is the test category
                if self.test_category and item['text'] != self.test_category:
                    continue

                # parse this category further
                yield Request(item['url'],
                              callback=self.parseCategory,
                              meta={'item': item})

            else:
                # if test category is set and this is not it, ignore
                if self.test_category and item['text'] != self.test_category:
                    continue

                yield item

        # add level 1 categories to items
        for link in links_level1:
            item = CategoryItem()
            item['text'] = link.select('text()').extract()[0]
            root_url = "http://www.amazon.com"
            item['url'] = root_url + link.select('@href').extract()[0]
            item['level'] = 1

            parent = link.select(
                "parent::node()/parent::node()/preceding-sibling::node()")
            parent_text = parent.select('text()').extract()

            # category should have a parent (its department) and that parent should have been extracted earlier (above) and put in the ids dictionary, necessary for getting the department id
            assert parent_text
            assert parent_text[0] in self.departments_ids
            if parent_text:
                item['parent_text'] = parent_text[0]
                item['department_text'] = item['parent_text']
                item['department_id'] = self.departments_ids[
                    item['department_text']]
                item['parent_catid'] = self.departments_cat_ids[
                    item['department_text']]
                item['catid'] = self.catid
                self.catid += 1

                # get department url from department_urls, will be available only for extra_categories
                if item['department_text'] in self.department_urls:
                    assert self.find_matching_key(
                        item['department_text'],
                        self.EXTRA_TOPLEVEL_CATEGORIES_URLS)
                    item['department_url'] = self.department_urls[
                        item['department_text']]
                    item['parent_url'] = item['department_url']

                    #TODO: leave this or not?
                    # Don't crawl subcategories of departments twice. If this is a department with url (extra_category), then we will crawl its subcategories. So ignore them here
                    #continue

                # if its parent is the special category, mark this one as special too
                if (item['parent_text'] == special_item['text']):
                    item['special'] = 1
                    special = True
                else:
                    special = False

            # department_id = self.department_count
            # self.department_count += 1

            # item['department_text'] = item['text']
            # item['department_url'] = item['url']
            # item['department_id'] = department_id

            # if self.test_category is set, only send request if this is the test category
            if self.test_category and item['text'] != self.test_category:
                continue

            yield Request(item['url'],
                          callback=self.parseCategory,
                          meta={'item': item})

    # parse category and return item corresponding to it (for categories where URL available - level 2 and lower)
    def parseCategory(self, response):

        # if we are getting blocked by captcha, solve and redirect back here
        # if there is a captcha to solve, and we haven't exhausted our retries, try to solve it
        if self.has_captcha(
                response.body) and ('retry_count' not in response.meta
                                    or response.meta['retry_count'] > 0):
            yield self.solve_captcha_and_redirect(
                response, self.parseCategory
            )  # meta of response will contain number of retries left if set
            return

        hxs = HtmlXPathSelector(response)

        # extract additional info for received parent and return it
        item = response.meta['item']

        # extract product count if available and not already extracted (in extractSubcategories, from the menu on the left, without crawling the actual url)
        if 'nr_products' not in item:
            prod_count_holder = hxs.select(
                "//h2[@class='resultCount']/span/text()").extract()
            if prod_count_holder:
                prod_count = prod_count_holder[0]
                # extract number

                # for paged results: Showing ... out of ... Results
                m = re.match(".*\s*of\s+([0-9,]+)\s+Results\s*", prod_count)

                # for one page results: Showing ... Result(s)
                if not m:
                    m = re.match(".*\s+([0-9,]+)\s+Results?\s*", prod_count)

                if m:
                    item['nr_products'] = int(re.sub(",", "", m.group(1)))
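
                # Illustrative behavior of the first pattern above (not from
                # the original example):
                #   >>> re.match(".*\s*of\s+([0-9,]+)\s+Results\s*",
                #   ...          "1-16 of 424,831 Results").group(1)
                #   '424,831'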

        # extract description if available
        # only extracts descriptions that contain an h2 - is that good?
        desc_holders = hxs.select(
            "//div[@class='unified_widget rcmBody'][descendant::h2][last()]")
        # select the one among these with the most text
        #TODO: another idea: check if the holder has an h2 item
        if desc_holders:
            maxsize = 0
            max_desc_holder = desc_holders[0]
            for desc_holder in desc_holders:
                size = len(" ".join(desc_holder.select(".//text()").extract()))

                if size > maxsize:
                    maxsize = size
                    max_desc_holder = desc_holder
            desc_holder = max_desc_holder
            desc_title = desc_holder.select("h2/text()").extract()
            if desc_title:
                item['description_title'] = desc_title[0].strip()

            description_texts = desc_holder.select(
                ".//text()[not(ancestor::h2)]").extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and any(
                    line.strip() for line in description_texts):
                # replace all whitespace with one space, strip, and remove empty texts; then join them
                item['description_text'] = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                if desc_title:
                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0
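
        # Illustrative whitespace normalization as done above (not from the
        # original example):
        #   >>> " ".join([re.sub("\s+", " ", t.strip())
        #   ...           for t in ["  Intro\n text ", "", " more  "] if t.strip()])
        #   'Intro text more'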

        # if item is found among EXTRA_TOPLEVEL_CATEGORIES_URLS, and no product count was found, add info from that url
        extra_category = self.find_matching_key(
            item['text'], self.EXTRA_TOPLEVEL_CATEGORIES_URLS)

        # crawl lower level categories
        if item['level'] > self.LEVEL_BARRIER:
            if extra_category:

                # collect number of products from this alternate URL
                # this will also extract subcategories and their count
                yield Request(
                    self.EXTRA_TOPLEVEL_CATEGORIES_URLS[extra_category],
                    callback=self.extractSubcategories,
                    meta={'item': item})

            else:
                # extract subcategories and their count for category even if not in extra_...
                yield Request(item['url'],
                              callback=self.extractSubcategories,
                              meta={'item': item})
        else:
            yield item

    # extract and yield subcategories for a category
    # use menu on left side of the page on the category page
    # will mainly be used for categories in EXTRA_TOPLEVEL_CATEGORIES_URLS

    # after subcategories extracted, send them to parseCategory to extract description as well
    # Note: it's not exhaustive - if the page doesn't match what it expects, it gives up
    def extractSubcategories(self, response):

        # if there is a captcha to solve, and we haven't exhausted our retries, try to solve it
        if self.has_captcha(
                response.body) and ('retry_count' not in response.meta
                                    or response.meta['retry_count'] > 0):
            yield self.solve_captcha_and_redirect(
                response, self.extractSubcategories
            )  # meta of response will contain number of retries left if set
            return

        hxs = HtmlXPathSelector(response)

        # return the received item, then extract its subcategories
        parent_item = response.meta['item']

        yield parent_item

        # extract subcategories, if level is above barrier
        # extract subcategories from first menu on the left, assume this is the subcategories menu

        if parent_item['level'] > self.LEVEL_BARRIER:

            # check if it should be treated as a special category (exceptions to usual page structure); then extract the subcategories with the appropriate method
            if self.isSpecialCategoryMenu(parent_item):
                subcategories = self.extractSubcategoriesFromMenuSpecial(
                    hxs, parent_item)

                # if no subcategories were found, try with the regular extraction as well (ex http://www.amazon.com/clothing-accessories-men-women-kids/b/ref=sd_allcat_apr/179-7724806-1781144?ie=UTF8&node=1036592)
                if not subcategories:
                    subcategories = self.extractSubcategoriesFromMenu(hxs)

            else:
                subcategories = self.extractSubcategoriesFromMenu(hxs)

            for (subcategory_text, subcategory_url,
                 subcategory_prodcount) in subcategories:

                item = CategoryItem()
                item['url'] = subcategory_url
                item['text'] = subcategory_text
                item['catid'] = self.catid
                self.catid += 1

                if subcategory_prodcount:
                    item['nr_products'] = int(subcategory_prodcount)

                item['parent_text'] = parent_item['text']
                item['parent_url'] = parent_item['url']
                item['parent_catid'] = parent_item['catid']

                # considering departments to be level 2 categories (top level) - so every category must have a department text
                assert 'department_text' in parent_item
                if 'department_text' in parent_item:
                    item['department_text'] = parent_item['department_text']
                    #item['department_url'] = parent_item['department_url']
                    item['department_id'] = parent_item['department_id']

                # only level 2 categories in extra_categories have department_url
                if 'department_url' in parent_item:
                    item['department_url'] = parent_item['department_url']
                else:
                    assert not self.find_matching_key(
                        item['department_text'],
                        self.EXTRA_TOPLEVEL_CATEGORIES_URLS)

                # else:
                #     # the parent must be a level 2 category - so this will be considered department
                #     assert parent_item['level'] == 2
                #     item['department_text'] = item['text']
                #     #item['department_url'] = item['url']
                #     item['department_id'] = self.department_count
                #     self.department_count += 1

                item['level'] = parent_item['level'] - 1

                # # no description extracted
                # item['description_wc'] = 0

                # send to parseCategory to extract description as well
                yield Request(item['url'],
                              callback=self.parseCategory,
                              meta={'item': item})

    # given a page (selector for it), extract subcategories from menu on the left
    # return generator of tuples representing subcategories with (name, url, item count)
    def extractSubcategoriesFromMenu(self, hxs):

        # extract subcategories for regular page structure
        subcategories = hxs.select(
            "//h2[text()='Department']/following-sibling::ul[1]/li/a")
        # only try "Shop by Department" if there is no "Department", otherwise might cause problems when both are present. e.g (http://www.amazon.com/Watches-Mens-Womens-Kids-Accessories/b/ref=sd_allcat_watches/187-9021585-5419616?ie=UTF8&node=377110011)
        if not subcategories:
            subcategories = hxs.select(
                "(//h2 | //h3)[text()='Shop by Department']/following-sibling::ul[1]/li/a"
            )

        for subcategory in subcategories:
            # if we have a subcategory URL and product count with the expected format extract it, otherwise move on

            # there is an exception to this refinement-link rule: in that case, extract info directly from the subcategory node, but only if len(text) > 1 (otherwise we'd catch all the little arrows for parent cats)
            if not subcategory.select("span[@class='refinementLink']"):
                if len(subcategory.select(".//text()").extract()
                       [0].strip()) > 1:  # so it's not that little arrow thing
                    subcategory_text_holder = subcategory.select(
                        "text()[normalize-space()!='']").extract()
                    if subcategory_text_holder:
                        subcategory_text = subcategory_text_holder[0].strip()
                    else:
                        continue
                    subcategory_url_holder = subcategory.select(
                        "@href").extract()
                    if subcategory_url_holder:
                        subcategory_url = Utils.add_domain(
                            subcategory_url_holder[0], "http://www.amazon.com")
                    else:
                        continue
                    subcategory_prodcount_holder = None
                else:
                    continue

            else:

                subcategory_url = Utils.add_domain(
                    subcategory.select("@href").extract()[0],
                    "http://www.amazon.com")
                subcategory_text = subcategory.select(
                    "span[@class='refinementLink']//text()").extract(
                    )[0].strip()
                # extract product count, clean it of commas and parentheses
                subcategory_prodcount_holder = subcategory.select(
                    "span[@class='narrowValue']/text()").extract()

            # if there's also product count available in the menu, extract it
            if subcategory_prodcount_holder:
                subcategory_prodcount = subcategory_prodcount_holder[
                    0].replace("&nbsp;", " ").strip()

                m = re.match("\(([0-9,]+)\)", subcategory_prodcount)
                if m:
                    subcategory_prodcount = m.group(1).replace(",", "")
            else:
                subcategory_prodcount = None

            yield (subcategory_text, subcategory_url, subcategory_prodcount)
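
    # Illustrative product-count cleanup matching the code above (not from
    # the original example):
    #   >>> m = re.match("\(([0-9,]+)\)", "(12,345)")
    #   >>> m.group(1).replace(",", "")
    #   '12345'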

    # extract subcategories from category page from special category pages that do not conform to regular page structure
    # return list of nodes containing the subcategories
    # check which category this is and send to specific method for extracting subcategories
    def extractSubcategoriesFromMenuSpecial(self, hxs, category):
        cat_title = category['text']
        if cat_title in ["Team Sports", "All Sports & Outdoors"]:
            return self.extractSubcategoriesSports(hxs)
        if category['text'] == 'Accessories' and ("Clothing"
                                                  in category['parent_text']):
            return self.extractSubcategoriesAccessories(hxs)

    # extract subcategories for special category: Sports
    def extractSubcategoriesSports(self, hxs):
        subcategories = hxs.select(
            "//h3[text()='Shop by Sport']/following-sibling::ul[1]/li/a")

        for subcategory in subcategories:
            subcategory_name = subcategory.select("text()").extract()[0]
            subcategory_url = Utils.add_domain(
                subcategory.select("@href").extract()[0],
                "http://www.amazon.com")

            yield (subcategory_name, subcategory_url, None)

    # extract subcategories for special category: Accessories in Clothing
    def extractSubcategoriesAccessories(self, hxs):
        subcategories = hxs.select("//a[contains(text(),'Shop All')]")
        for subcategory in subcategories:
            # extract words after "Shop All" - that is the subcategory name
            subcategory_text_full = subcategory.select("text()").extract()[0]
            m = re.match("Shop All (.*)", subcategory_text_full)
            subcategory_name = m.group(1).strip()
            subcategory_url = Utils.add_domain(
                subcategory.select("@href").extract()[0],
                "http://www.amazon.com")

            yield (subcategory_name, subcategory_url, None)
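
    # Illustrative "Shop All" name extraction (not from the original example):
    #   >>> re.match("Shop All (.*)", "Shop All Belts").group(1).strip()
    #   'Belts'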

    # check if category is special and subcategories from its menu should be extracted in a specific way
    #TODO: replace these tests with some tests based on URL, more robust (after figuring out which is the stable part of the url)
    def isSpecialCategoryMenu(self, category):
        # category names with special page structure whose subcategories menu need to be parsed specifically
        # these are the titles found on the respective categories' pages
        SUBCATS_MENU_SPECIAL = ['Team Sports', 'All Sports & Outdoors']
        if category['text'] in SUBCATS_MENU_SPECIAL:
            return True

        if category['text'] == 'Accessories' and ("Clothing"
                                                  in category['parent_text']):
            #print "IS SPECIAL", category['url']
            return True
Exemplo n.º 18
class AmazonSpider(Spider):
    name = 'amazon'
    allowed_domains = ["amazon.com"]
    start_urls = []

    # SEARCH_URL = 'http://www.amazon.com/s/ref=sr_as_oo?' \
    #              'rh=i%3Aaps%2Ck%3A{search_term}&keywords={search_term}'

    SEARCH_URL = 'http://www.amazon.com/s/ref=nb_sb_noss_2' \
                 '?url=search-alias%3Daps&field-keywords={search_term}'

    MAX_RETRIES = 3

    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) Gecko'
                  '/20100101 Firefox/35.0')

    USER_AGENTS = {
        'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\
            'Gecko/20100101 Firefox/35.0',
        'desktop': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '\
            'Gecko/20100101 Firefox/35.0',
        'iphone_ipad': 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_6 '\
            'like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) '\
            'Version/7.0 Mobile/11B651 Safari/9537.53',
        'android_phone': 'Mozilla/5.0 (Android; Mobile; rv:35.0) '\
            'Gecko/35.0 Firefox/35.0',
        'android_pad': 'Mozilla/5.0 (Android; Tablet; rv:35.0) '\
            'Gecko/35.0 Firefox/35.0',
        'android': 'Mozilla/5.0 (Android; Tablet; rv:35.0) '\
            'Gecko/35.0 Firefox/35.0',
    }

    def __init__(self,
                 url_formatter=None,
                 quantity=None,
                 page=None,
                 searchterms_str='laptop',
                 searchterms_fn=None,
                 site_name=None,
                 product_url=None,
                 user_agent=None,
                 captcha_retries='10',
                 *args,
                 **kwargs):
        if user_agent is None or user_agent not in self.USER_AGENTS:
            self.log(
                "Unknown user agent type, or none was set."
                " The default user agent will be used.", INFO)
            user_agent = 'default'

        if user_agent:
            self.user_agent = self.USER_AGENTS[user_agent]
            self.user_agent_key = user_agent

        super(AmazonSpider, self).__init__(*args, **kwargs)

        if site_name is None:
            assert len(self.allowed_domains) == 1, \
                "A single allowed domain is required to auto-detect site name."
            self.site_name = self.allowed_domains[0]
        else:
            self.site_name = site_name

        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter

        if quantity is None:
            self.log("No quantity specified. Will retrieve all products.",
                     INFO)
            import sys
            self.quantity = sys.maxint
        else:
            self.quantity = int(quantity)

        if page is None:
            self.log("No page specified. Will retrieve all products.", INFO)
            import sys
            self.page = sys.maxint
        else:
            self.page = int(page)

        self.product_url = product_url

        self.searchterms = []
        if searchterms_str is not None:
            self.searchterms = searchterms_str.decode('utf-8').split(',')
        elif searchterms_fn is not None:
            import io  # built-in open() has no encoding kwarg on Python 2
            with io.open(searchterms_fn, encoding='utf-8') as f:
                self.searchterms = f.readlines()
        else:
            self.log("No search terms provided!", ERROR)

        self.log(
            "Created for %s with %d search terms." %
            (self.site_name, len(self.searchterms)), INFO)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()

    def make_requests_from_url(self, _):
        """This method does not apply to this type of spider so it is overriden
        and "disabled" by making it raise an exception unconditionally.
        """
        raise AssertionError("Need a search term.")

    def start_requests(self):
        """Generate Requests from the SEARCH_URL and the search terms."""
        for st in self.searchterms:
            yield Request(
                self.url_formatter.format(
                    self.SEARCH_URL,
                    search_term=urllib.quote_plus(st.encode('utf-8')),
                ),
                meta={
                    'search_term': st,
                    'remaining': self.quantity
                },
            )
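
    # Illustrative URL produced by the formatter above (not from the original
    # example; the search term is hypothetical):
    #   >>> string.Formatter().format(AmazonSpider.SEARCH_URL,
    #   ...     search_term=urllib.quote_plus('red shoes'))
    #   'http://www.amazon.com/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords=red+shoes'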

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = self.parse_without_captcha(response)
        return result

    def parse_without_captcha(self, response):
        if self._search_page_error(response):
            remaining = response.meta['remaining']
            search_term = response.meta['search_term']

            self.log(
                "For search term '%s' with %d items remaining,"
                " failed to retrieve search page: %s" %
                (search_term, remaining, response.request.url), WARNING)
        else:
            prods_count = -1  # Also used after the loop.
            for prods_count, request_or_prod in enumerate(
                    self._get_products(response)):
                yield request_or_prod
            prods_count += 1  # enumerate() is zero-based; convert index to count.

            request = self._get_next_products_page(response, prods_count)
            if request is not None:
                yield request

    def _get_products(self, response):
        remaining = response.meta['remaining']
        search_term = response.meta['search_term']
        total_matches = response.meta.get('total_matches')

        prods = self._scrape_product_links(response)

        if total_matches is None:
            total_matches = self._scrape_total_matches(response)
            if total_matches is not None:
                response.meta['total_matches'] = total_matches
                self.log("Found %d total matches." % total_matches, INFO)
            else:
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to parse total matches for %s" %
                            response.url, ERROR)

        for i, prod_item in enumerate(islice(prods, 0, remaining)):
            prod_item['keyword'] = search_term
            prod_item['total_matches'] = total_matches
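            # rank is absolute across all result pages: items consumed on
            # previous pages (self.quantity - remaining) plus the 1-based
            # index on this page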
            prod_item['rank'] = (i + 1) + (self.quantity - remaining)
            yield prod_item

    def _get_next_products_page(self, response, prods_found):
        page_number = int(response.meta.get('page_number', 1))
        link_page_attempt = response.meta.get('link_page_attempt', 1)

        result = None
        if prods_found is not None:
            # This was a real product listing page.
            if page_number < self.page:
                remaining = response.meta['remaining']
                remaining -= prods_found
                next_page = self._scrape_next_results_page_link(response)
                if next_page is not None:
                    url = urlparse.urljoin(response.url, next_page)
                    new_meta = dict(response.meta)
                    new_meta['remaining'] = remaining
                    new_meta['page_number'] = page_number + 1
                    result = Request(url,
                                     self.parse,
                                     meta=new_meta,
                                     priority=1)
        elif link_page_attempt > self.MAX_RETRIES:
            self.log(
                "Giving up on results page after %d attempts: %s" %
                (link_page_attempt, response.request.url), ERROR)
        else:
            self.log(
                "Will retry to get results page (attempt %d): %s" %
                (link_page_attempt, response.request.url), WARNING)

            # Found no product links. Probably a transient error; let's retry.
            new_meta = response.meta.copy()
            new_meta['link_page_attempt'] = link_page_attempt + 1
            result = response.request.replace(meta=new_meta,
                                              cookies={},
                                              dont_filter=True)

        return result

    def _scrape_total_matches(self, response):
        if response.css('#noResultsTitle'):
            return 0

        values = response.css('#s-result-count ::text').re(
            '([0-9,]+)\s[Rr]esults for')
        if not values:
            values = response.css('#resultCount > span ::text').re(
                '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults')
            if not values:
                values = response.css('#result-count-only-next').xpath(
                    'comment()').re('\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults\s+')

        if values:
            total_matches = int(values[0].replace(',', ''))
        else:
            if not self.is_nothing_found(response):
                self.log("Failed to parse total number of matches for: %s" %
                         response.url,
                         level=ERROR)
            total_matches = None
        return total_matches
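
    # Illustrative strings the patterns above are meant to match (not from
    # the original example):
    #   "1-16 of 424,831 results for"     -> '424,831'  (first pattern)
    #   "Showing 1 - 24 of 1,234 Results" -> '1,234'    (fallback patterns)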

    def _scrape_product_links(self, response):
        products = response.xpath('//li[@class="s-result-item"]')

        for pr in products:
            if pr.xpath('.//h5[contains(@class, "s-sponsored-list-header")] |'
                        './/h5[contains(text(), "Sponsored")]'):
                continue
            product = ProductItem()

            cond_set(product, 'title', pr.xpath('.//h2/../@title').extract())

            cond_set(product, 'product_image',
                     pr.xpath('.//img[@alt="Product Details"]/@src').extract())

            cond_set(
                product, 'brand',
                pr.xpath('.//div[@class="a-fixed-left-grid-col a-col-right"]'
                         '/div/div/span[2]/text() |'
                         './/div[@class="a-row a-spacing-mini"]/span[2]/text()'
                         ).extract())

            cond_set(
                product, 'price',
                pr.xpath(
                    './/span[contains(@class,"s-price")]/text()').extract())

            cond_set(product, 'asin', pr.xpath('@data-asin').extract())

            if pr.xpath('.//i[contains(@class, "a-icon-prime")]'):
                cond_set_value(product, 'prime', True)
            else:
                cond_set_value(product, 'prime', False)

            cond_set(
                product, 'shipping_price',
                pr.xpath(
                    './/span[contains(@class,"s-price")]/'
                    'following::span[2]/text()').re('(\d+.?\d+) shipping'))

            new = pr.xpath('.//a[contains(text(),"new")]/span/text()')

            if new:
                cond_set(product, 'new_price', new.extract())
                cond_set(product, 'new_offers', new[1].re('\d+'))

            used = pr.xpath('.//a[contains(text(),"used")]/span/text()')

            if used:
                cond_set(product, 'used_price', used.extract())
                cond_set(product, 'used_offers', used[1].re('\d+'))

            cond_set(
                product, 'rating',
                pr.xpath('.//span[contains(@name,"' + product['asin'] +
                         '")]/span/a/i/span').re('(\d+.?\d+)'))

            cond_set(
                product, 'number_of_reviews',
                pr.xpath('.//span[contains(@name,"' + product['asin'] + '")]/'
                         'following::a[1]/text()').re('([\d+,?]+\d+)'))

            category = pr.xpath(
                './/span[contains(@class,"a-text-bold")]/text()').re('(.*):')

            if not category:
                category = response.xpath(
                    '//div[@id="autoscoping-backlink"]/div/span/span/text()'
                ).extract()

            cond_set(product, 'category', category)

            number_of_items = pr.xpath(
                './/span[contains(@class,"a-text-bold")]/../text()').re(
                    '([\d+,?]+\d+)')

            if number_of_items:
                cond_set_value(product, 'number_of_items', number_of_items[0])
            else:
                cond_set_value(product, 'number_of_items',
                               response.meta.get('total_matches'))

            product['all_brands'] = response.xpath(
                '//h2[text()="Brand"]/following::ul[1]/'
                'li[@class="refinementImage"]/a/span/text()').extract()

            yield product

    def _scrape_next_results_page_link(self, response):
        next_pages = response.css('#pagnNextLink ::attr(href)').extract()
        next_page_url = None
        if len(next_pages) == 1:
            next_page_url = next_pages[0]
        elif len(next_pages) > 1:
            self.log("Found more than one 'next page' link.", ERROR)
        return next_page_url

    def is_nothing_found(self, response):
        txt = response.xpath('//h1[@id="noResultsTitle"]/text()').extract()
        txt = ''.join(txt)
        return 'did not match any products' in txt

    def _search_page_error(self, response):
        body = response.body_as_unicode()
        return "Your search" in body \
            and  "did not match any products." in body

    # Captcha handling functions.
    def _has_captcha(self, response):
        return '.images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "More than one form found."

        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]

        self.log("Extracted capcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        self.log("Captcha challenge for %s (try %d)." %
                 (url, captcha_solve_try),
                 level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log("Failed to guess captcha for '%s' (try: %d)." %
                     (url, captcha_solve_try),
                     level=ERROR)
            result = None
        else:
            self.log("On try %d, submitting captcha '%s' for '%s'." %
                     (captcha_solve_try, captcha, url),
                     level=INFO)
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback,
                dont_filter=True,
                meta=meta)

        return result
Exemplo n.º 19
try:
    from captcha_solver import CaptchaBreakerWrapper
except Exception as e:
    print '!!!!!!!!Captcha breaker is not available due to: %s' % e

    class CaptchaBreakerWrapper(object):
        @staticmethod
        def solve_captcha(url):
            msg("CaptchaBreaker in not available for url: %s" % url,
                level=WARNING)
            return None


_cbw = CaptchaBreakerWrapper()


def _has_captcha(response):
    return '.images-amazon.com/captcha/' in response.content


def _solve_captcha(response):
    soup = BeautifulSoup(response.content, "html.parser")
    forms = soup.findAll(itemprop="image")
    assert len(forms) == 1, "More than one form found."

    captcha_img = forms[0]['src']

    return _cbw.solve_captcha(captcha_img)
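

# Illustrative usage (not part of the original example; `resp` is assumed to
# be a requests-style response with a `.content` attribute):
#   if _has_captcha(resp):
#       guess = _solve_captcha(resp)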
Exemplo n.º 20
class AmazonProductsSpider(AmazonTests, BaseProductsSpider):
    name = 'amazon_products'
    allowed_domains = ["amazon.com"]

    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) Gecko'
                  '/20100101 Firefox/35.0')

    SEARCH_URL = ('http://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias'
                  '%3Daps&field-keywords={search_term}')

    settings = AmazonValidatorSettings

    buyer_reviews_stars = [
        'one_star', 'two_star', 'three_star', 'four_star', 'five_star'
    ]

    def __init__(self, captcha_retries='10', *args, **kwargs):
        super(AmazonProductsSpider, self).__init__(*args, **kwargs)

        self.captcha_retries = int(captcha_retries)

        self.mtp_class = Amazon_marketplace(self)

        self._cbw = CaptchaBreakerWrapper()

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = super(AmazonProductsSpider, self).parse(response)
        return result

    def _get_products(self, response):
        result = super(AmazonProductsSpider, self)._get_products(response)
        for r in result:
            if isinstance(r, Request):
                r = r.replace(dont_filter=True)
            yield r

    def parse_product(self, response):
        prod = response.meta['product']

        prod['buyer_reviews'] = self._build_buyer_reviews(response)

        if not self._has_captcha(response):
            self._populate_from_js(response, prod)

            self._populate_from_html(response, prod)

            cond_set_value(prod, 'locale', 'en-US')  # Default locale.

            mkt_place_link = urlparse.urljoin(
                response.url,
                is_empty(response.xpath(
                    "//div[contains(@class, 'a-box-inner')]" \
                    "//a[contains(@href, '/gp/offer-listing/')]/@href |" \
                    "//div[@id='secondaryUsedAndNew']" \
                    "//a[contains(@href, '/gp/offer-listing/')]/@href"
                ).extract()))

            new_meta = response.meta.copy()
            new_meta['product'] = prod
            if isinstance(prod["buyer_reviews"], Request):
                if mkt_place_link:
                    new_meta["mkt_place_link"] = mkt_place_link
                return prod["buyer_reviews"].replace(meta=new_meta,
                                                     dont_filter=True)

            if mkt_place_link:
                return Request(url=mkt_place_link,
                               callback=self.parse_marketplace,
                               meta=new_meta,
                               dont_filter=True)

            result = prod

        elif response.meta.get('captcha_solve_try', 0) >= self.captcha_retries:
            self.log("Giving up on trying to solve the captcha challenge after"
                     " %s tries for: %s" % (self.captcha_retries, prod['url']),
                     level=WARNING)
            result = None
        else:
            result = self._handle_captcha(response, self.parse_product)

        return result

    def _get_price(self, response, product):
        """ Parses and sets the product price, with all possible variations
        :param response: Scrapy's Response obj
        :param product: Scrapy's Item (dict, basically)
        :return: None
        """
        cond_set(
            product,
            'price',
            response.css('#priceblock_ourprice ::text'
                         ', #unqualifiedBuyBox .a-color-price ::text'
                         ', #priceblock_saleprice ::text'
                         ', #actualPriceValue ::text'
                         ', #buyNewSection .offer-price ::text').extract(),
        )
        if not product.get('price', None):
            cond_set(
                product, 'price',
                response.xpath(
                    '//td/b[@class="priceLarge"]/text() |'
                    '//span[@class="olp-padding-right"]'
                    '/span[@class="a-color-price"]/text() |'
                    '//div[contains(@data-reftag,"atv_dp_bb_est_hd_movie")]'
                    '/button/text() |'
                    '//span[@id="priceblock_saleprice"]/text() |'
                    '//li[@class="swatchElement selected"]'
                    '//span[@class="a-color-price"]/text() |'
                    '//div[contains(@data-reftag,"atv_dp_bb_est_sd_movie")]'
                    '/button/text() |'
                    '//div[@id="mocaBBRegularPrice"]'
                    '/div/text()[normalize-space()]').extract())
        if product.get('price', None):
            if '$' not in product['price']:
                if 'FREE' in product['price'] or ' ' in product['price']:
                    product['price'] = Price(priceCurrency='USD', price='0.00')
                else:
                    self.log('Currency symbol not recognized: %s' %
                             response.url,
                             level=ERROR)
            else:
                price = re.findall('[\d ,.]+\d', product['price'])
                price = re.sub('[, ]', '', price[0])
                product['price'] = Price(
                    priceCurrency='USD',
                    price=price.replace('$', '').strip()\
                        .replace(',', '')
                )
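
    # Illustrative price normalization matching the code above (not from the
    # original example):
    #   >>> price = re.findall('[\d ,.]+\d', '$1,234.56')
    #   >>> re.sub('[, ]', '', price[0])
    #   '1234.56'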

    def populate_bestseller_rank(self, product, response):
        ranks = {
            ' > '.join(
                map(unicode.strip,
                    itm.css('.zg_hrsr_ladder a::text').extract())):
            int(
                re.sub('[ ,]', '',
                       itm.css('.zg_hrsr_rank::text').re('([\d, ]+)')[0]))
            for itm in response.css('.zg_hrsr_item')
        }
        prim = response.css('#SalesRank::text, #SalesRank .value'
                            '::text').re('#([\d ,]+) .*in (.+)\(')
        if prim:
            prim = {prim[1].strip(): int(re.sub('[ ,]', '', prim[0]))}
            ranks.update(prim)
        ranks = [{'category': k, 'rank': v} for k, v in ranks.iteritems()]
        cond_set_value(product, 'category', ranks)
        # parse department
        department = amazon_parse_department(ranks)
        if department is None:
            product['department'] = None
        else:
            product['department'], product['bestseller_rank'] \
                = department.items()[0]
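
    # Illustrative sales-rank parsing (not from the original example): a text
    # like "#1,234 in Electronics (See Top 100)" matched against
    # '#([\d ,]+) .*in (.+)\(' yields ['1,234', 'Electronics '], which the
    # code above turns into {'Electronics': 1234}.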

    def _populate_from_html(self, response, product):
        cond_set(product, 'brand', response.css('#brand ::text').extract())
        self._get_price(response, product)

        brand_name = is_empty(
            response.xpath('//a[@id="brand"]/text()').extract())
        cond_set(product, 'brand', brand_name)

        av = AmazonVariants()
        av.setupSC(response)
        product['variants'] = av._variants()

        brand_logo = is_empty(
            response.xpath('//a[@id="brand"]/@href').extract())
        if brand_logo:
            brand = brand_logo.split('/')[1]
            cond_set_value(product, 'brand', brand)

        self.mtp_class.get_price_from_main_response(response, product)

        spans = response.xpath('//span[@class="a-text-bold"]')
        for span in spans:
            text = is_empty(span.xpath('text()').extract())
            if text and 'Item model number:' in text:
                possible_model = span.xpath('../span/text()').extract()
                if len(possible_model) > 1:
                    model = possible_model[1]
                    cond_set_value(product, 'model', model)

        description = response.css('.productDescriptionWrapper').extract()
        if not description:
            iframe_content = re.findall(r'var iframeContent = "(.*)"',
                                        response.body)
            if iframe_content:
                res = iframe_content[0]
                f = re.findall(
                    'body%3E%0A%20%20(.*)'
                    '%0A%20%20%3C%2Fbody%3E%0A%3C%2Fhtml%3E%0A', res)
                if f:
                    desc = unquote(f[0])
                    description = [desc]
        if not description:
            description = response.xpath(
                '//div[@id="descriptionAndDetails"] |'
                '//div[@id="feature-bullets"] |'
                '//div[@id="ps-content"] |'
                '//div[@id="productDescription_feature_div"] |'
                '//div[contains(@class, "dv-simple-synopsis")] |'
                '//div[@class="bucket"]/div[@class="content"]').extract()

        cond_set(
            product,
            'description',
            description,
        )

        image = response.css(
            '#imgTagWrapperId > img ::attr(data-old-hires)').extract()
        if not image:
            j = re.findall(r"'colorImages': { 'initial': (.*)},",
                           response.body)
            if not j:
                j = re.findall(r'colorImages = {"initial":(.*)}',
                               response.body)
            if j:
                try:
                    res = json.loads(j[0])
                    # the parsed JSON's shape varies; try both known layouts
                    try:
                        image = res[0]['large']
                    except (KeyError, IndexError, TypeError):
                        image = res[1]['large']
                    image = [image]
                except (ValueError, KeyError, IndexError, TypeError):
                    pass
        if not image:
            image = response.xpath(
                '//div[@class="main-image-inner-wrapper"]/img/@src |'
                '//div[@id="coverArt_feature_div"]//img/@src |'
                '//div[@id="img-canvas"]/img/@src |'
                '//div[@class="dp-meta-icon-container"]/img/@src |'
                '//input[@id="mocaGlamorImageUrl"]/@value |'
                '//div[@class="egcProdImageContainer"]'
                '/img[@class="egcDesignPreviewBG"]/@src |'
                '//img[@id="main-image"]/@src').extract()

        if len(image) > 0 and image[0]:
            if product.get('image_url'):
                product['image_url'] = image[0]
            else:
                cond_set(product, 'image_url', image)

        title = response.css('#productTitle ::text').extract()
        if not title:
            title = response.xpath(
                '//div[@class="buying"]/h1/span[@id="btAsinTitle"]/text() |'
                '//div[@id="title_feature_div"]/h1/text() |'
                '//div[@id="title_row"]/span/h1/text() |'
                '//h1[@id="aiv-content-title"]/text() |'
                '//div[@id="item_name"]/text()').extract()
        if not title:
            parts = response.xpath(
                '//div[@id="mnbaProductTitleAndYear"]/span/text()').extract()
            if parts:
                title = [''.join(parts)]
        cond_set(product, 'title', title)

        # Some data is in a list (ul element).
        model = None
        for li in response.css('td.bucket > .content > ul > li'):
            raw_keys = li.xpath('b/text()').extract()
            if not raw_keys:
                # This is something else, ignore.
                continue

            key = raw_keys[0].strip(' :').upper()
            if key == 'UPC':
                # Some products have several UPCs.
                raw_upc = li.xpath('text()').extract()[0]
                cond_set_value(
                    product,
                    'upc',
                    raw_upc.strip().replace(' ', ';'),
                )
            elif (key == 'ASIN' and model is None) or key == 'ITEM MODEL NUMBER':
                model = li.xpath('text()').extract()
        cond_set(product, 'model', model, conv=string.strip)
        self.populate_bestseller_rank(product, response)

    def _populate_from_js(self, response, product):
        # Images are not always on the same spot...
        img_jsons = response.css(
            '#landingImage ::attr(data-a-dynamic-image)').extract()
        if img_jsons:
            img_data = json.loads(img_jsons[0])
            cond_set_value(product,
                           'image_url',
                           max(img_data.items(), key=lambda
                               (_, size): size[0]),
                           conv=lambda (url, _): url)
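
    # Illustrative image selection (not from the original example; URLs are
    # hypothetical): for {'http://u/small.jpg': [300, 300],
    # 'http://u/big.jpg': [1500, 1500]}, max() keyed on the first size value
    # picks the widest entry, and conv extracts its URL, 'http://u/big.jpg'.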

    def _get_rating_by_star_by_individual_request(self, response):
        product = response.meta['product']
        mkt_place_link = response.meta.get("mkt_place_link")
        current_star = response.meta['_current_star']
        current_star_int = [
            i + 1 for i, _star in enumerate(self.buyer_reviews_stars)
            if _star == current_star
        ][0]
        br = product.get('buyer_reviews')
        if br:
            rating_by_star = br.get('rating_by_star')
        else:
            if mkt_place_link:
                return self.mkt_request(mkt_place_link, {"product": product})
            return product
        if not rating_by_star:
            rating_by_star = {}
        num_of_reviews_for_star = re.search(
            r'Showing .+? of ([\d,\.]+) reviews', response.body)
        if num_of_reviews_for_star:
            num_of_reviews_for_star = num_of_reviews_for_star.group(1)
            num_of_reviews_for_star = num_of_reviews_for_star\
                .replace(',', '').replace('.', '')
            rating_by_star[str(current_star_int)] \
                = int(num_of_reviews_for_star)
        if not str(current_star_int) in rating_by_star.keys():
            rating_by_star[str(current_star_int)] = 0

        product['buyer_reviews']['rating_by_star'] = rating_by_star
        if len(product['buyer_reviews']['rating_by_star']) >= 5:
            product['buyer_reviews']['num_of_reviews'] \
                = int(product['buyer_reviews']['num_of_reviews'])
            product['buyer_reviews']['average_rating'] \
                = float(product['buyer_reviews']['average_rating'])
            # all star counts collected - the product can now be returned
            product['buyer_reviews'] = BuyerReviews(**product['buyer_reviews'])
            if mkt_place_link:
                return self.mkt_request(mkt_place_link, {"product": product})
            return product

    def _get_asin_from_url(self, url):
        match = re.search(r'/([A-Z0-9]{4,15})/', url)
        if match:
            return match.group(1)
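
    # Illustrative ASIN extraction (not from the original example; the URL is
    # hypothetical):
    #   >>> re.search(r'/([A-Z0-9]{4,15})/',
    #   ...           'http://www.amazon.com/dp/B00EXAMPLE/ref=sr_1_1').group(1)
    #   'B00EXAMPLE'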

    def _create_post_requests(self, response, asin):
        url = ('http://www.amazon.com/ss/customer-reviews/ajax/reviews/get/'
               'ref=cm_cr_pr_viewopt_sr')
        meta = response.meta
        meta['_current_star'] = {}
        for star in self.buyer_reviews_stars:
            args = {
                'asin': asin,
                'filterByStar': star,
                'filterByKeyword': '',
                'formatType': 'all_formats',
                'pageNumber': '1',
                'pageSize': '10',
                'sortBy': 'helpful',
                'reftag': 'cm_cr_pr_viewopt_sr',
                'reviewerType': 'all_reviews',
                'scope': 'reviewsAjax0',
            }
            meta['_current_star'] = star
            yield FormRequest(
                url=url,
                formdata=args,
                meta=meta,
                callback=self._get_rating_by_star_by_individual_request,
                dont_filter=True)

    def get_buyer_reviews_from_2nd_page(self, response):
        if self._has_captcha(response):
            return self._handle_captcha(response,
                                        self.get_buyer_reviews_from_2nd_page)
        product = response.meta["product"]
        buyer_reviews = {}
        product["buyer_reviews"] = {}
        buyer_reviews["num_of_reviews"] = is_empty(
            response.xpath(
                '//span[contains(@class, "totalReviewCount")]/text()').extract(
                ), '').replace(",", "")
        if not buyer_reviews['num_of_reviews']:
            buyer_reviews['num_of_reviews'] = ZERO_REVIEWS_VALUE
        average = is_empty(
            response.xpath(
                '//div[contains(@class, "averageStarRatingNumerical")]//span/text()'
            ).extract(), "")

        buyer_reviews["average_rating"] = \
            average.replace('out of 5 stars', '')

        buyer_reviews["rating_by_star"] = {}
        buyer_reviews = self.get_rating_by_star(response, buyer_reviews)[0]

        #print('*' * 20, 'parsing buyer reviews from', response.url)

        if not buyer_reviews.get('rating_by_star'):
            response.meta['product']['buyer_reviews'] = buyer_reviews
            # if still no rating_by_star (probably the rating is percent-based)
            return self._create_post_requests(
                response, self._get_asin_from_url(response.url))
            #return

        product["buyer_reviews"] = BuyerReviews(**buyer_reviews)

        meta = {"product": product}
        mkt_place_link = response.meta.get("mkt_place_link", None)
        if mkt_place_link:
            return Request(url=mkt_place_link,
                           callback=self.parse_marketplace,
                           meta=meta,
                           dont_filter=True)

        return product

    def _build_buyer_reviews(self, response):
        buyer_reviews = {}

        total = response.xpath('string(//*[@id="summaryStars"])').re(
            FLOATING_POINT_RGEX)
        if not total:
            total = response.xpath(
                'string(//div[@id="acr"]/div[@class="txtsmall"]'
                '/div[contains(@class, "acrCount")])').re(FLOATING_POINT_RGEX)
            if not total:
                return ZERO_REVIEWS_VALUE
        buyer_reviews['num_of_reviews'] = int(total[0].replace(',', ''))

        average = response.xpath('//*[@id="summaryStars"]/a/@title')
        if not average:
            average = response.xpath(
                '//div[@id="acr"]/div[@class="txtsmall"]'
                '/div[contains(@class, "acrRating")]/text()')
        average = average.extract()[0].replace('out of 5 stars', '')
        buyer_reviews['average_rating'] = float(average)

        buyer_reviews['rating_by_star'] = {}
        buyer_reviews, _ = self.get_rating_by_star(response, buyer_reviews)

        if not buyer_reviews.get('rating_by_star'):
            # No histogram on this page: follow the "See all reviews" link
            # to the dedicated reviews page instead.
            buyer_rev_link = is_empty(response.xpath(
                '//div[@id="revSum"]//a[contains(text(), "See all")'
                ' or contains(text(), "See the customer review")'
                ' or contains(text(), "See both customer reviews")]/@href'
            ).extract())
            if not buyer_rev_link:
                return ZERO_REVIEWS_VALUE
            # Returning a Request is safe here because it will be
            # re-crawled in the `parse_product` method.
            return Request(
                url=buyer_rev_link,
                callback=self.get_buyer_reviews_from_2nd_page)

        return BuyerReviews(**buyer_reviews)

    def get_rating_by_star(self, response, buyer_reviews):
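        # Extract the per-star histogram. Newer pages use #histogramTable,
        # older ones the #revH layout; percent-based histograms are skipped
        # here and picked up later via _create_post_requests.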
        table = response.xpath('//table[@id="histogramTable"]'
                               '/tr[@class="a-histogram-row"]')
        if table:
            for tr in table:
                rating = is_empty(
                    tr.xpath('string(.//td[1])').re(FLOATING_POINT_RGEX))
                number = is_empty(
                    tr.xpath('string(.//td[last()])').re(FLOATING_POINT_RGEX))
                is_perc = is_empty(tr.xpath('string(.//td[last()])').extract())
                if "%" in is_perc:
                    break
                if number:
                    buyer_reviews['rating_by_star'][rating] = int(
                        number.replace(',', ''))
        else:
            table = response.xpath(
                '//div[@id="revH"]/div/div[contains(@class, "fl")]')
            for div in table:
                rating = div.xpath(
                    'string(.//div[contains(@class, "histoRating")])').re(
                        FLOATING_POINT_RGEX)[0]
                number = div.xpath(
                    'string(.//div[contains(@class, "histoCount")])').re(
                        FLOATING_POINT_RGEX)[0]
                buyer_reviews['rating_by_star'][rating] = int(
                    number.replace(',', ''))
        return buyer_reviews, table

    def _scrape_total_matches(self, response):
        # Where this value appears varies between page layouts, so we try a
        # few alternative selectors to capture it consistently.

        if response.css('#noResultsTitle'):
            return 0

        # Every result I saw is shown with this format
        #    1-16 of 424,831 results for
        #    2 results for
        values = response.css('#s-result-count ::text').re(
            r'([0-9,]+)\s[Rr]esults for')
        if not values:
            # The first possible place is where it normally is in a fully
            # rendered page.
            values = response.css('#resultCount > span ::text').re(
                r'\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults')
            if not values:
                # Otherwise, it appears within a comment.
                values = response.css('#result-count-only-next').xpath(
                    'comment()').re(r'\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults\s+')

        if values:
            total_matches = int(values[0].replace(',', ''))
        else:
            if not self.is_nothing_found(response):
                self.log("Failed to parse total number of matches for: %s" %
                         response.url,
                         level=ERROR)
            total_matches = None
        return total_matches

    def _scrape_results_per_page(self, response):
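        # Read the per-page result count from the "1-16 of N" style header.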
        num = response.xpath('//*[@id="s-result-count"]/text()').re(
            r'1-(\d+) of')
        if num:
            return int(num[0])
        else:
            num = response.xpath('//*[@id="s-result-count"]/text()').re(
                r'(\d+) results')
            if num:
                return int(num[0])
        return None

    def _scrape_product_links(self, response):
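        # Search results appear in several alternative containers depending
        # on the page layout, hence the multi-branch XPath. Result indices
        # embedded in the @id attributes are used to stop at the first
        # out-of-order item, apparently to cut off trailing non-organic
        # results.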
        lis = response.xpath("//div[@id='resultsCol']//ul//li |"
                             "//div[@id='mainResults']//ul//li"
                             "[contains(@id, 'result')] |"
                             "//div[@id='atfResults']//ul//li"
                             "[contains(@id, 'result')] |"
                             "//div[@id='mainResults']//div"
                             "[contains(@id, 'result')]")
        links = []
        last_idx = -1
        for li in lis:
            try:
                is_prime = li.xpath(
                    "*/descendant::i[contains(concat(' ',@class,' '),' a-icon-prime ')] |"
                    ".//span[contains(@class, 'sprPrime')]")
                is_prime_pantry = li.xpath(
                    "*/descendant::i[contains(concat(' ',@class,' '),' a-icon-prime-pantry ')]"
                )
                data_asin = li.xpath('@id').extract()[0]
                idx = int(re.findall(r'\d+', data_asin)[0])
                if idx > last_idx:
                    link = li.xpath(
                        ".//a[contains(@class,'s-access-detail-page')]/@href |"
                        ".//h3[@class='newaps']/a/@href").extract()[0]
                    if 'slredirect' in link:
                        link = urlparse.urljoin('http://amazon.com/', link)
                    links.append((link, is_prime, is_prime_pantry))
                else:
                    break
                last_idx = idx
            except IndexError:
                continue
        if not links:
            self.log("Found no product links.", WARNING)

        for link, is_prime, is_prime_pantry in links:
            prime = None
            if is_prime:
                prime = 'Prime'
            if is_prime_pantry:
                prime = 'PrimePantry'
            yield link, SiteProductItem(prime=prime)

    def _scrape_next_results_page_link(self, response):
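        # Follow the pagination "Next" link; log an error if the page
        # unexpectedly carries more than one.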
        next_pages = response.css('#pagnNextLink ::attr(href)').extract()
        next_page_url = None
        if len(next_pages) == 1:
            next_page_url = next_pages[0]
        elif len(next_pages) > 1:
            self.log("Found more than one 'next page' link.", ERROR)
        return next_page_url

    def _search_page_error(self, response):
        body = response.body_as_unicode()
        return ("Your search" in body
                and "did not match any products." in body)

    # Captcha handling functions.
    def _has_captcha(self, response):
        return '.images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "Expected exactly one form on the page."

        # Use a relative XPath so the image is looked up inside the form.
        captcha_img = forms[0].xpath(
            './/img[contains(@src, "/captcha/")]/@src').extract()[0]

        self.log("Extracted capcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        # FIXME This is untested and may be wrong.
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        # Honor the retry limit configured in __init__ so we do not loop
        # forever on captchas we cannot solve.
        if captcha_solve_try >= self.captcha_retries:
            self.log("Giving up on captcha for '%s' after %d tries." %
                     (url, captcha_solve_try), level=ERROR)
            return None
        self.log("Captcha challenge for %s (try %d)." %
                 (url, captcha_solve_try),
                 level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log("Failed to guess captcha for '%s' (try: %d)." %
                     (url, captcha_solve_try),
                     level=ERROR)
            result = None
        else:
            self.log("On try %d, submitting captcha '%s' for '%s'." %
                     (captcha_solve_try, captcha, url),
                     level=INFO)
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback,
                dont_filter=True,
                meta=meta)

        return result

    def _parse_single_product(self, response):
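        # Single-product mode reuses the regular product parser.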
        return self.parse_product(response)

    def parse_marketplace(self, response):
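        # Delegate to the shared Amazon_marketplace helper created in
        # __init__ (self.mtp_class); it is expected to call back into
        # exit_point() below when done.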
        response.meta["called_class"] = self
        response.meta["next_req"] = None
        return self.mtp_class.parse_marketplace(response)

    def exit_point(self, product, next_req):
        if next_req:
            # Request.replace() returns a new request rather than mutating
            # in place, so the result must be reassigned.
            next_req = next_req.replace(meta={"product": product})
            return next_req
        return product

    def is_nothing_found(self, response):
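        # True when the search page shows the "did not match any products"
        # message instead of results.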
        txt = response.xpath('//h1[@id="noResultsTitle"]/text()').extract()
        txt = ''.join(txt)
        return 'did not match any products' in txt

    def mkt_request(self, link, meta):
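        # Build a marketplace request; dont_filter is set, presumably
        # because the same marketplace URL can be scheduled for several
        # products.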
        return Request(url=link,
                       callback=self.parse_marketplace,
                       meta=meta,
                       dont_filter=True)
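

# A minimal, self-contained sanity check (an illustrative sketch, not part of
# the spider) for the result-count patterns handled in _scrape_total_matches
# above. The sample strings are assumptions based on the formats quoted in
# that method's comments.
if __name__ == '__main__':
    import re

    samples = [
        "1-16 of 424,831 results for",
        "2 results for",
    ]
    for text in samples:
        match = re.search(r'([0-9,]+)\s[Rr]esults for', text)
        total = int(match.group(1).replace(',', '')) if match else None
        print('%s -> %s' % (text, total))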