Example #1
 def get_defaults_spider_mw(self):
     crawler = get_crawler()
     spider = Spider('foo')
     spider.set_crawler(crawler)
     defaults = dict([(k, [v]) for k, v in \
         six.iteritems(crawler.settings.get('DEFAULT_REQUEST_HEADERS'))])
     return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
Example #2
 def get_defaults_spider_mw(self):
     crawler = get_crawler()
     spider = Spider('foo')
     spider.set_crawler(crawler)
     defaults = dict([(k, [v]) for k, v in \
         crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
     return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
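Both variants build the same fixture: a crawler, a spider bound to it via set_crawler, the DEFAULT_REQUEST_HEADERS setting expanded into header/value-list pairs, and a DefaultHeadersMiddleware built from the crawler. A minimal sketch of a test method that could exercise this helper follows; it assumes the enclosing class is a unittest.TestCase, that Request is imported from scrapy.http, and the assertion is illustrative rather than taken from either snippet.

 def test_process_request(self):
     # hypothetical test: the middleware should stamp every default header
     # onto an outgoing request
     defaults, spider, mw = self.get_defaults_spider_mw()
     req = Request('http://www.scrapytest.org')
     mw.process_request(req, spider)
     self.assertEqual(req.headers, defaults)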
Example #3
class ChunkExtensionTest(object):
    settings = {}

    def tearDown(self):
        self.remove_temp_dir()

    def start(self, n_items_per_chunk=None, n_items=None, settings=None):

        # Reset item generator and remove temporary dir
        ItemGenerator.reset()
        self.remove_temp_dir()

        # Setup settings
        settings = settings or self.settings.copy()
        if n_items_per_chunk is not None:
            settings['CHUNKED_FEED_ITEMS_PER_CHUNK'] = n_items_per_chunk

        # Init Scrapy
        self.crawler = get_crawler(settings)
        self.spider = Spider('chunk_test')
        self.spider.set_crawler(self.crawler)
        self.extension = ChunkedFeedExporter.from_crawler(self.crawler)
        self.extension.open_spider(self.spider)

        # Add items if we have to
        if n_items:
            self.add_items(n_items)

    def stop(self):
        return self.extension.close_spider(self.spider)

    def remove_temp_dir(self):
        shutil.rmtree(EXPORT_TEMP_DIR, ignore_errors=True)

    def add_items(self, n_items):
        for i in range(n_items):
            item = ItemGenerator.generate()
            self.extension.item_scraped(item, self.spider)

    def get_chunk_filename(self, chunk):
        return EXPORT_FILE_PATTERN % {'chunk_number': chunk}

    def get_chunk_filenames(self):
        return [f for f in os.listdir(EXPORT_TEMP_DIR) if f.endswith(".json")]

    def get_number_of_chunks(self):
        return len(self.get_chunk_filenames())

    def get_chunk_content(self, chunk):
        with open(self.get_chunk_filename(chunk)) as f:
            return json.load(f)

    def ensure_number_of_chunks(self, n_chunks):
        n = self.get_number_of_chunks()
        assert n_chunks == n, "Wrong number of chunks. found %d, expecting %d" % (n, n_chunks)

    def ensure_number_of_exported_items_per_chunk(self, chunk, n_items):
        n_exported_items = len(self.get_chunk_content(chunk))
        assert n_items == n_exported_items, "Wrong number of exported items. found %d, expecting %d" % \
                                            (n_exported_items, n_items)
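A hypothetical test built on this harness could drive the exporter end to end. In the sketch below the item counts, the expectation of three chunk files, and the 1-based chunk numbering are assumptions for illustration; the helper names come from the class above, and the class is assumed to be mixed into a unittest.TestCase.

    def test_items_split_into_chunks(self):
        # 25 items at 10 items per chunk: expect 2 full chunks plus 1 partial,
        # assuming the exporter flushes the last partial chunk on close
        self.start(n_items_per_chunk=10, n_items=25)
        self.stop()
        self.ensure_number_of_chunks(3)
        self.ensure_number_of_exported_items_per_chunk(1, 10)  # assumes 1-based chunk numbers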
Example #4
 def _export_streamitem(self, values):
     item = load_item_from_values(values)
     crawler = get_crawler()
     spider = Spider('streamitem_test')
     spider.set_crawler(crawler)
     storage = StreamItemFileFeedStorage(EXPORT_SC_FILENAME)
     exporter = StreamItemExporter(file=storage.open(spider))
     exporter.start_exporting()
     exporter.export_item(item)
     exporter.finish_exporting()
Example #5
class ManagerTestCase(TestCase):

    settings_dict = None

    def setUp(self):
        self.crawler = get_crawler(self.settings_dict)
        self.spider = Spider('foo')
        self.spider.set_crawler(self.crawler)
        self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
        # some mw depends on stats collector
        self.crawler.stats.open_spider(self.spider)
        return self.mwman.open_spider(self.spider)

    def tearDown(self):
        self.crawler.stats.close_spider(self.spider, '')
        return self.mwman.close_spider(self.spider)

    def _download(self, request, response=None):
        """Executes downloader mw manager's download method and returns
        the result (Request or Response) or raise exception in case of
        failure.
        """
        if not response:
            response = Response(request.url)

        def download_func(**kwargs):
            return response

        dfd = self.mwman.download(download_func, request, self.spider)
        # catch deferred result and return the value
        results = []
        dfd.addBoth(results.append)
        self._wait(dfd)
        ret = results[0]
        if isinstance(ret, Failure):
            ret.raiseException()
        return ret
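A concrete test can subclass ManagerTestCase and push a request/response pair through the whole default middleware chain via _download. The sketch below mirrors the shape of Scrapy's own downloader-middleware tests; the class name and the bare pass-through assertion are illustrative, and Request/Response are assumed to be imported from scrapy.http as in the example.

class DefaultsTest(ManagerTestCase):
    """Pass a plain 200 response through the default middleware chain."""

    def test_request_response(self):
        req = Request('http://scrapytest.org/')
        resp = Response(req.url, status=200)
        ret = self._download(request=req, response=resp)
        self.assertTrue(isinstance(ret, Response), "Non-response returned")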
Example #7
 def get_spider_and_mw(self, default_useragent):
     crawler = get_crawler({'USER_AGENT': default_useragent})
     spider = Spider('foo')
     spider.set_crawler(crawler)
     return spider, UserAgentMiddleware.from_crawler(crawler)
 def get_request_spider_mw(self):
     crawler = get_crawler()
     spider = Spider('foo')
     spider.set_crawler(crawler)
     request = Request('http://scrapytest.org/')
     return request, spider, DownloadTimeoutMiddleware.from_crawler(crawler)
 def get_request_spider_mw(self):
     crawler = get_crawler()
     spider = Spider("foo")
     spider.set_crawler(crawler)
     request = Request("http://scrapytest.org/")
     return request, spider, DownloadTimeoutMiddleware.from_crawler(crawler)
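Tests for these helpers typically assert on what process_request leaves behind on the request. The sketches below follow the pattern of Scrapy's own tests for these middlewares, but the user-agent string and the 180-second value (the stock DOWNLOAD_TIMEOUT default) are assumptions, and the enclosing classes are assumed to be unittest.TestCase subclasses.

 def test_default_agent(self):
     spider, mw = self.get_spider_and_mw('default_useragent')
     req = Request('http://scrapytest.org/')
     assert mw.process_request(req, spider) is None
     # the USER_AGENT value from the crawler settings ends up on the request
     self.assertEqual(req.headers['User-Agent'], b'default_useragent')

 def test_default_download_timeout(self):
     req, spider, mw = self.get_request_spider_mw()
     mw.spider_opened(spider)  # lets a spider-level download_timeout override the default
     assert mw.process_request(req, spider) is None
     self.assertEqual(req.meta.get('download_timeout'), 180)  # assumes default settings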