Example #1
    def _retry(self, request, reason, spider):
        # Count how many times this request has been retried so far
        retries = request.meta.get('retry_times', 0) + 1

        stats = spider.crawler.stats
        if retries <= self.max_retry_times:
            spider.logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {'request': request, 'retries': retries, 'reason': reason},
                extra={'spider': spider})
            # Re-issue a copy of the request, bypassing the dupe filter and
            # bumping its priority so the retry is scheduled promptly
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            spider.logger.debug(
                "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                {'request': request, 'retries': retries, 'reason': reason},
                extra={'spider': spider})
            # Retries exhausted: return an empty placeholder response carrying a
            # project-specific sentinel status code instead of dropping the request
            return Response(url='', status=12138, body=b'')
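For context, a `_retry` helper like this is normally called from a downloader middleware's `process_response` (for retryable status codes) and `process_exception` (for network errors). A minimal sketch, assuming the method above lives on a subclass of Scrapy's built-in `RetryMiddleware`, which provides `max_retry_times`, `priority_adjust` and `retry_http_codes`; the class name here is hypothetical:

from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message

class CustomRetryMiddleware(RetryMiddleware):
    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            # Unlike the stock middleware, the _retry above never returns None:
            # it returns either a retried request or the 12138 placeholder response
            return self._retry(request, reason, spider)
        return response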
Example #2
def test_spider_crawls_links(spider, scrape_request, html_headers,
                             mock_html_twolinks):
    """Ensure spider always picks up relevant links to HTML pages"""
    # Use only 1 user agent for easier counting
    ua = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [ua]

    # Generate a mock response based on html containing two links
    mock_response = Response('http://test:12345',
                             body=mock_html_twolinks)
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.meta['user_agent'] = ua
    mock_response.status = 200
    mock_response.encoding = u'utf-8'
    mock_response.flags = []

    # Call spider on the mock response
    pipeline_generator = spider.parse(mock_response)

    # Assert that we got the expected set of new requests generated in the
    # spider and nothing else
    sites_expected = set([
            mock_response.url + '/link1.html',
            mock_response.url + '/link2.html',
            ])

    sites_collected = []
    for new_request in pipeline_generator:
        # Only follow-up Requests count; anything else the spider yields is ignored
        if isinstance(new_request, Request):
            sites_collected.append(new_request.url)

    assert sites_expected == set(sites_collected)
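The `spider`, `scrape_request`, `html_headers` and `mock_html_twolinks` fixtures come from the surrounding test suite and are not shown. A hypothetical sketch of the only one the assertion really constrains, the markup containing the two expected links:

import pytest

@pytest.fixture
def mock_html_twolinks():
    # Any markup works as long as it resolves to link1.html and link2.html
    return ('<html><body>'
            '<a href="link1.html">one</a> '
            '<a href="link2.html">two</a>'
            '</body></html>')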
Example #3
def test_useragents_spider(spider, scrape_request, html_headers,
                           mock_html_nolinks):
    """Ensure multiple requests with different user agent strings emitted"""
    ua1 = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    ua2 = factories.BatchUserAgentFactory.build(ua_string='Chrome / 20.0')
    spider.batch_user_agents = [ua1, ua2]

    # Generate a mock response
    mock_response = Response('http://test:12345',
                             body=mock_html_nolinks)
    mock_response.request = scrape_request
    mock_response.headers = html_headers
    mock_response.status = 200
    mock_response.encoding = u'utf-8'
    mock_response.flags = []

    # Call the spider on the mock response
    pipeline_generator = spider.parse(mock_response)

    # Assert that we have two requests for this linkless page, one for each
    # of the user agents we inserted
    request_uas = []
    for new_request in pipeline_generator:
        if isinstance(new_request, Request):
            request_uas.append(new_request.meta['user_agent'].ua_string)
        else:
            # We're not expecting anything other than Requests
            assert False

    assert set(request_uas) == set([u'Firefox / 11.0', u'Chrome / 20.0'])
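Note the stricter loop than in the previous test: any non-Request output trips the `assert False` immediately rather than being skipped, so a stray item fails the test before the user-agent set comparison at the end ever runs.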
Example #4
def test_js_item_emission(spider, linked_js_request, js_headers, mock_js):
    """JS items are emitted correctly"""
    # Generate a mock response based on JS
    mock_url = 'http://test:12345/default.js'
    mock_response = Response(mock_url, body=mock_js)
    mock_response.request = linked_js_request
    mock_response.headers = js_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # Generate a fake urlscan to use in our item comparison
    mock_urlscan = model.URLScan.objects.create(
        site_scan=linked_js_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Send the mocks to the spider for processing
    pipeline_generator = spider.parse(mock_response)

    # Verify the item returned is what we expected
    item_expected = MarkupItem()
    item_expected['content_type'] = spider.get_content_type(js_headers)
    item_expected['filename'] = os.path.basename(urlparse(mock_url).path)
    item_expected['headers'] = unicode(js_headers)
    item_expected['meta'] = mock_response.meta
    item_expected['raw_content'] = mock_response.body
    item_expected['sitescan'] = linked_js_request.meta['sitescan']
    item_expected['urlscan'] = mock_urlscan
    item_expected['url'] = mock_response.url
    item_expected['user_agent'] = mock_response.meta['user_agent']
    item_expected['redirected_from'] = ''

    assert list(pipeline_generator) == [item_expected]
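This JS test drains the generator with `list(...)` and compares against a one-element list, which also pins the spider to emitting exactly one item. The CSS variant below instead walks the generator, keeps the last `MarkupItem` it sees, and compares that, so it only guards the item's type and contents, not the count.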
Example #5
def test_css_item_emission(spider, linked_css_request, css_headers, mock_css):
    """CSS items are emitted correctly"""
    # Use only 1 user agent for easier counting
    ua1 = factories.BatchUserAgentFactory(ua_string='Firefox / 11.0')
    spider.user_agents = [ua1]

    # Generate a mock response based on CSS
    mock_url = 'http://test:12345/default.css'
    mock_response = Response(mock_url,
                             body=mock_css)
    mock_response.request = linked_css_request
    mock_response.headers = css_headers
    mock_response.status = 200
    mock_response.encoding = u'ascii'
    mock_response.flags = []

    # Generate a fake urlscan to use in our item comparison
    mock_urlscan = model.URLScan.objects.create(
        site_scan=linked_css_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=mock_response.url,
        timestamp=spider.get_now_time())

    # Send the mocks to the spider for processing
    pipeline_generator = spider.parse(mock_response)

    # Verify the item returned is what we expected
    item_expected = MarkupItem()
    item_expected['content_type'] = spider.get_content_type(css_headers)
    item_expected['filename'] = os.path.basename(urlparse(mock_url).path)
    item_expected['headers'] = unicode(css_headers)
    item_expected['meta'] = mock_response.meta
    item_expected['raw_content'] = mock_response.body
    item_expected['sitescan'] = linked_css_request.meta['sitescan']
    item_expected['urlscan'] = mock_urlscan
    item_expected['url'] = mock_response.url
    item_expected['user_agent'] = mock_response.meta['user_agent']

    item_collected = None
    for item in pipeline_generator:
        if isinstance(item, MarkupItem):
            item_collected = item
        else:
            # We're not expecting anything other than MarkupItems
            assert False

    assert item_expected == item_collected
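Unlike the JS test, `item_expected['redirected_from']` is never set here, so the final equality check assumes the spider leaves that field unset (or defaulted) for CSS responses that were not redirected.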