def test_store(self):
    out = BytesIO()
    storage = StdoutFeedStorage('stdout:', _stdout=out)
    file = storage.open(scrapy.Spider("default"))
    file.write(b"content")
    yield storage.store(file)
    self.assertEqual(out.getvalue(), b"content")

def test_get_exporter_local_file(self):
    pipeline = PodcastPipeline()
    pipeline.process_item(PodcastDataItem(), None)
    spider = scrapy.Spider(name="test_spider", settings={})
    spider.settings['OUTPUT_URI'] = './local-file.xml'
    exporter = pipeline._get_exporter(spider)
    self.assertIsInstance(exporter, PodcastToFileItemExporter)

def test_get_exporter_s3_file(self):
    pipeline = PodcastPipeline()
    pipeline.process_item(PodcastDataItem(), None)
    spider = scrapy.Spider(name="test_spider", settings={})
    spider.settings['OUTPUT_URI'] = 's3://my-bucket/my-podcast.xml'
    exporter = pipeline._get_exporter(spider)
    self.assertIsInstance(exporter, PodcastToS3ItemExporter)

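# For context: a minimal, hypothetical sketch of the _get_exporter dispatch
# the two tests above exercise -- not the project's actual implementation.
# It assumes the exporter is chosen purely by the OUTPUT_URI scheme.
def _get_exporter(self, spider):
    uri = spider.settings['OUTPUT_URI']
    if uri.startswith('s3://'):
        return PodcastToS3ItemExporter(uri)
    return PodcastToFileItemExporter(uri)
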
def _assert_stores(self, storage, path):
    spider = scrapy.Spider("default")
    file = storage.open(spider)
    file.write(b"content")
    yield storage.store(file)
    self.assertTrue(os.path.exists(path))
    with open(path, 'rb') as fp:
        self.assertEqual(fp.read(), b"content")

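# A hedged usage sketch for the helper above, mirroring how Scrapy's own
# feed-export tests drive it with the local FileFeedStorage. It assumes
# FileFeedStorage (scrapy.extensions.feedexport), path_to_file_uri
# (scrapy.utils.url), and Twisted's TestCase.mktemp() are available, and
# that _assert_stores is an inlineCallbacks-style generator.
def test_store_file_uri(self):
    path = os.path.abspath(self.mktemp())
    uri = path_to_file_uri(path)
    return self._assert_stores(FileFeedStorage(uri), path)
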
def test_add_spider_name():
    pipeline = AddSpiderNamePipeline()
    spider = scrapy.Spider(name='my-spider-name')
    item = BasicItem()
    item['id'] = 'my-unique-id'
    item['url'] = 'http://example.com/'
    item['source'] = 'dummy source'
    processed_item = pipeline.process_item(item, spider)
    assert processed_item['spider'] == 'my-spider-name'

def _assert_stores(self, storage, path):
    spider = scrapy.Spider("default")
    file = storage.open(spider)
    file.write(b"content")
    yield storage.store(file)
    self.assertTrue(os.path.exists(path))
    with open(path, 'rb') as fp:
        self.assertEqual(fp.read(), b"content")
    # again, to check s3 objects are overwritten
    yield storage.store(BytesIO(b"new content"))
    with open(path, 'rb') as fp:
        self.assertEqual(fp.read(), b"new content")

def test_add_namespace():
    # TODO: This test should also test from_crawler if possible
    pipeline = AddNamespacePipeline('my-namespace')
    spider = scrapy.Spider(name='my-spider-name')
    item = BasicItem()
    item['id'] = 'my-unique-id'
    item['url'] = 'http://example.com/'
    item['source'] = 'dummy source'
    processed_item = pipeline.process_item(item, spider)
    assert processed_item['namespace'] == 'my-namespace'

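# Addressing the TODO above: a hypothetical sketch of the pipeline under
# test, including the from_crawler hook the comment refers to. The
# NAMESPACE setting name is an assumption, not from the original project.
class AddNamespacePipeline:
    def __init__(self, namespace):
        self.namespace = namespace

    @classmethod
    def from_crawler(cls, crawler):
        # Built from crawler settings, as Scrapy does at startup.
        return cls(crawler.settings.get('NAMESPACE'))

    def process_item(self, item, spider):
        item['namespace'] = self.namespace
        return item
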
def test_store(self):
    assert_aws_environ()
    uri = os.environ.get('S3_TEST_FILE_URI')
    if not uri:
        raise unittest.SkipTest("No S3 URI available for testing")
    storage = S3FeedStorage(uri)
    verifyObject(IFeedStorage, storage)
    file = storage.open(scrapy.Spider("default"))
    expected_content = b"content: \xe2\x98\x83"
    file.write(expected_content)
    yield storage.store(file)
    u = urlparse(uri)
    content = get_s3_content_and_delete(u.hostname, u.path[1:])
    self.assertEqual(content, expected_content)

def test_store(self):
    assert_aws_environ()
    uri = os.environ.get('FEEDTEST_S3_URI')
    if not uri:
        raise unittest.SkipTest("No S3 URI available for testing")
    from boto import connect_s3
    storage = S3FeedStorage(uri)
    verifyObject(IFeedStorage, storage)
    file = storage.open(scrapy.Spider("default"))
    # storage files expect bytes; boto's get_contents_as_string also
    # returns bytes, so compare bytes to bytes
    file.write(b"content")
    yield storage.store(file)
    u = urlparse(uri)
    key = connect_s3().get_bucket(u.hostname, validate=False).get_key(u.path)
    self.assertEqual(key.get_contents_as_string(), b"content")

def test_add_crawl_time():
    pipeline = AddCrawlTimePipeline()
    spider = scrapy.Spider(name='my-spider-name')
    item = BasicItem()
    item['id'] = 'my-unique-id'
    item['url'] = 'http://example.com/'
    item['source'] = 'dummy source'
    processed_item = pipeline.process_item(item, spider)
    crawl_time = datetime.datetime.strptime(
        processed_item['crawl_time'], '%Y-%m-%dT%H:%M:%SZ')
    shortly_ago = datetime.datetime.utcnow() - datetime.timedelta(seconds=10)
    shortly_after = datetime.datetime.utcnow() + datetime.timedelta(seconds=10)
    assert shortly_ago < crawl_time < shortly_after

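# A hypothetical sketch of the pipeline the test above exercises: it stamps
# each item with the UTC crawl time in the exact format the test parses.
import datetime

class AddCrawlTimePipeline:
    def process_item(self, item, spider):
        item['crawl_time'] = datetime.datetime.utcnow().strftime(
            '%Y-%m-%dT%H:%M:%SZ')
        return item
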
def __init__(self, domain):
    """
    domain: the domain to extract links from (must start with http://)
    """
    self.domain = domain
    self.scrapy = scrapy.Spider(self.domain)
    self.scrapy.allowed_domains = [self.scrapy.name]
    self.scrapy.start_urls = [self.scrapy.name]
    self.settings = get_project_settings()
    self.crawler = Crawler(self.settings)
    # Stop the reactor once the spider closes, so reactor.run() returns.
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    self.crawler.configure()
    # Legacy (pre-1.0) Scrapy API: the crawler, not the spider, runs the crawl.
    self.crawler.crawl(self.scrapy)
    self.crawler.start()
    log.start()
    reactor.run()
    self.links = []

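# For comparison, a minimal sketch of the same bootstrap on modern Scrapy
# (>= 1.0), where CrawlerProcess replaces the manual Crawler/reactor wiring.
# The spider class and domain handling here are assumptions for illustration.
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class LinkSpider(scrapy.Spider):
    name = "links"

    def __init__(self, domain, **kwargs):
        super().__init__(**kwargs)
        self.allowed_domains = [domain]
        self.start_urls = [f"http://{domain}/"]

process = CrawlerProcess(get_project_settings())
process.crawl(LinkSpider, domain="example.com")
process.start()  # blocks until the crawl finishes; no manual reactor.run()
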
VIEWSTATE = selector.xpath('//*[@id="__VIEWSTATE"]/@value').extract_first()
EVENTVALIDATION = selector.xpath(
    '//*[@id="__EVENTVALIDATION"]/@value').extract_first()
VIEWSTATEGENERATOR = selector.xpath(
    '//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first()
script = 'ctl00$ContentPlaceHolder1$updatepanelread|ctl00$ContentPlaceHolder1$ddsection'
formdata = {
    # change pages here
    "ctl00$ContentPlaceHolder1$script": script,
    "ctl00$ContentPlaceHolder1$ddbook": "1",
    "ctl00$ContentPlaceHolder1$ddsection": "1",
    "__EVENTTARGET": "ctl00$ContentPlaceHolder1$ddsection",
    "__EVENTARGUMENT": "",
    "__LASTFOCUS": "",
    "__VIEWSTATE": VIEWSTATE,
    "__VIEWSTATEGENERATOR": VIEWSTATEGENERATOR,
    "__EVENTVALIDATION": EVENTVALIDATION,
    # "ctl00$ContentPlaceHolder1$ddchapter": '',
    "__ASYNCPOST": "true&",
    # "ScriptManager.SupportsPartialRendering": "true",
    # "ctl00$ContentPlaceHolder1$btnSearch": "Search"
}
yield {'data': response.text}

spider = AyuSpider(scrapy.Spider("AyuSpider"))
spider.start_requests()

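# Note: the formdata assembled above is never actually submitted in the
# snippet; in a typical ASP.NET postback crawl it would be posted back with
# scrapy.FormRequest. A hedged sketch (the method and callback names are
# assumptions, not from the original spider):
def request_next_section(self, response, formdata):
    return scrapy.FormRequest(
        url=response.url,
        formdata=formdata,
        callback=self.parse_section,
    )
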
def test_cache_args():
    spider = scrapy.Spider(name='foo')
    mw = _get_mw()
    mw.crawler.spider = spider
    mw.spider_opened(spider)
    dedupe_mw = SplashDeduplicateArgsMiddleware()

    # ========= Send first request - it should use save_args:
    lua_source = 'function main(splash) end'
    req = SplashRequest('http://example.com/foo',
                        endpoint='execute',
                        args={'lua_source': lua_source},
                        cache_args=['lua_source'])
    assert req.meta['splash']['args']['lua_source'] == lua_source
    # <---- spider
    req, = list(dedupe_mw.process_start_requests([req], spider))
    # ----> scheduler
    assert req.meta['splash']['args']['lua_source'] != lua_source
    assert list(mw._argument_values.values()) == [lua_source]
    assert list(mw._argument_values.keys()) == [req.meta['splash']['args']['lua_source']]
    # <---- scheduler
    # process request before sending it to the downloader
    req = mw.process_request(req, spider) or req
    # -----> downloader
    assert req.meta['splash']['args']['lua_source'] == lua_source
    assert req.meta['splash']['args']['save_args'] == ['lua_source']
    assert 'load_args' not in req.meta['splash']['args']
    assert req.meta['splash']['_local_arg_fingerprints'] == {
        'lua_source': list(mw._argument_values.keys())[0]
    }
    # <---- downloader
    resp_body = b'{}'
    resp = TextResponse("http://example.com",
                        headers={
                            b'Content-Type': b'application/json',
                            b'X-Splash-Saved-Arguments': b'lua_source=ba001160ef96fe2a3f938fea9e6762e204a562b3',
                        },
                        body=resp_body)
    resp = mw.process_response(req, resp, None)

    # ============ Send second request - it should use load_args
    req2 = SplashRequest('http://example.com/bar',
                         endpoint='execute',
                         args={'lua_source': lua_source},
                         cache_args=['lua_source'])
    req2, item = list(dedupe_mw.process_spider_output(
        resp, [req2, {'key': 'value'}], spider))
    assert item == {'key': 'value'}
    # ----> scheduler
    assert req2.meta['splash']['args']['lua_source'] != lua_source
    # <---- scheduler
    # process request before sending it to the downloader
    req2 = mw.process_request(req2, spider) or req2
    # -----> downloader
    assert req2.meta['splash']['args']['load_args'] == {"lua_source": "ba001160ef96fe2a3f938fea9e6762e204a562b3"}
    assert "lua_source" not in req2.meta['splash']['args']
    assert "save_args" not in req2.meta['splash']['args']
    assert json.loads(req2.body.decode('utf8')) == {
        'load_args': {'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'},
        'url': 'http://example.com/bar'
    }
    # <---- downloader
    resp = TextResponse("http://example.com/bar",
                        headers={b'Content-Type': b'application/json'},
                        body=b'{}')
    resp = mw.process_response(req, resp, spider)

    # =========== Third request is dispatched to another server where
    # =========== arguments are expired:
    req3 = SplashRequest('http://example.com/baz',
                         endpoint='execute',
                         args={'lua_source': lua_source},
                         cache_args=['lua_source'])
    req3, = list(dedupe_mw.process_spider_output(resp, [req3], spider))
    # ----> scheduler
    assert req3.meta['splash']['args']['lua_source'] != lua_source
    # <---- scheduler
    req3 = mw.process_request(req3, spider) or req3
    # -----> downloader
    assert json.loads(req3.body.decode('utf8')) == {
        'load_args': {'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'},
        'url': 'http://example.com/baz'
    }
    # <---- downloader
    resp_body = json.dumps({
        "type": "ExpiredArguments",
        "description": "Arguments stored with ``save_args`` are expired",
        "info": {"expired": ["html"]},
        "error": 498
    })
    resp = TextResponse("127.0.0.1:8050",
                        headers={b'Content-Type': b'application/json'},
                        status=498,
                        body=resp_body.encode('utf8'))
    req4 = mw.process_response(req3, resp, spider)
    assert isinstance(req4, SplashRequest)

    # process this request again
    req4, = list(dedupe_mw.process_spider_output(resp, [req4], spider))
    req4 = mw.process_request(req4, spider) or req4

    # it should become save_args request after all middlewares
    assert json.loads(req4.body.decode('utf8')) == {
        'lua_source': 'function main(splash) end',
        'save_args': ['lua_source'],
        'url': 'http://example.com/baz'
    }
    assert mw._remote_keys == {}

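# Neither _get_mw() nor _get_crawler() appears in this excerpt; a plausible
# sketch following the scrapy-splash test-suite pattern. The exact settings
# and helper signatures are assumptions:
from scrapy.utils.test import get_crawler
from scrapy_splash import SplashMiddleware

def _get_crawler(settings_dict=None):
    return get_crawler(settings_dict=settings_dict or {})

def _get_mw():
    return SplashMiddleware.from_crawler(_get_crawler())
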
def test_magic_response_caching(tmpdir):
    # prepare middlewares
    spider = scrapy.Spider(name='foo')
    crawler = _get_crawler({
        'HTTPCACHE_DIR': str(tmpdir.join('cache')),
        'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
        'HTTPCACHE_ENABLED': True
    })
    cache_mw = HttpCacheMiddleware.from_crawler(crawler)
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    def _get_req():
        return SplashRequest(
            url="http://example.com",
            endpoint='execute',
            magic_response=True,
            args={'lua_source': 'function main(splash) end'},
        )

    # Emulate Scrapy middleware chain.

    # first call
    req = _get_req()
    req = cookie_mw.process_request(req, spider) or req
    req = mw.process_request(req, spider)
    req = cache_mw.process_request(req, spider) or req
    assert isinstance(req, scrapy.Request)  # first call; the cache is empty

    resp_data = {
        'html': "<html><body>Hello</body></html>",
        'render_time': 0.5,
    }
    resp_body = json.dumps(resp_data).encode('utf8')
    resp = TextResponse("http://example.com",
                        headers={b'Content-Type': b'application/json'},
                        body=resp_body)
    resp2 = cache_mw.process_response(req, resp, spider)
    resp3 = mw.process_response(req, resp2, spider)
    resp3 = cookie_mw.process_response(req, resp3, spider)
    assert resp3.text == "<html><body>Hello</body></html>"
    assert resp3.css("body").extract_first() == "<body>Hello</body>"
    assert resp3.data['render_time'] == 0.5

    # second call
    req = _get_req()
    req = cookie_mw.process_request(req, spider) or req
    req = mw.process_request(req, spider)
    cached_resp = cache_mw.process_request(req, spider) or req

    # response should be from cache:
    assert cached_resp.__class__ is TextResponse
    assert cached_resp.body == resp_body
    resp2_1 = cache_mw.process_response(req, cached_resp, spider)
    resp3_1 = mw.process_response(req, resp2_1, spider)
    resp3_1 = cookie_mw.process_response(req, resp3_1, spider)
    assert isinstance(resp3_1, scrapy_splash.SplashJsonResponse)
    assert resp3_1.body == b"<html><body>Hello</body></html>"
    assert resp3_1.text == "<html><body>Hello</body></html>"
    assert resp3_1.css("body").extract_first() == "<body>Hello</body>"
    assert resp3_1.data['render_time'] == 0.5
    assert resp3_1.headers[b'Content-Type'] == b'text/html; charset=utf-8'

import scrapy

# Spider requires a name; scrapy.Spider() with no arguments raises ValueError.
spider = scrapy.Spider(name="example")
spider.crawler  # AttributeError until the spider is bound via from_crawler()

def test_no_sticky_output(self):
    out = list(self.mw.process_spider_output(
        self.response, [scrapy.Request(URL)], scrapy.Spider('foo')))
    self.assertEqual(out[0].meta.get('cookiejar', None), None)