def test_from_crawler_missing_arguments(api_url, api_key):
    """Check that ``from_crawler`` raises ``NotConfigured`` for the given URI/key pair.

    ``api_url`` and ``api_key`` are written to the crawler settings unchanged.
    NOTE(review): presumably parametrized so that at least one of them is
    unset; the parametrization is not visible in this chunk.
    """
    spider = spider_with_crawler()
    settings = spider.crawler.settings
    for name, value in (('KINGFISHER_API_URI', api_url), ('KINGFISHER_API_KEY', api_key)):
        settings[name] = value

    with pytest.raises(NotConfigured) as excinfo:
        KingfisherProcessAPI.from_crawler(spider.crawler)

    expected = 'KINGFISHER_API_URI and/or KINGFISHER_API_KEY is not set.'
    assert str(excinfo.value) == expected
def test_spider_closed_other_reason(tmpdir):
    """Check that closing the spider with the reason ``'xxx'`` yields ``None``.

    NOTE(review): this is a generator (it uses ``yield``), so it presumably
    relies on a Twisted-aware decorator that is not visible in this chunk.
    """
    spider = spider_with_files_store(tmpdir)
    api = KingfisherProcessAPI.from_crawler(spider.crawler)

    result = yield api.spider_closed(spider, 'xxx')

    assert result is None
def test_spider_closed(sample, is_sample, ok, tmpdir, caplog):
    """Check the end-of-collection request sent when the spider closes.

    The response status code is patched to 200 or 400 depending on ``ok``.
    The close reason is ``'sample'`` when ``is_sample`` is truthy and
    ``'finished'`` otherwise. On a non-OK status, exactly one WARNING record
    is expected on the ``test`` logger.

    NOTE(review): this is a generator, so it presumably relies on a
    Twisted-aware decorator that is not visible in this chunk. The asserted
    URL points at httpbin.org, so the JSON body is presumably an echo of the
    request.
    """
    with patch('treq.response._Response.code', new_callable=PropertyMock) as status_code:
        if ok:
            status_code.return_value = 200
        else:
            status_code.return_value = 400

        spider = spider_with_files_store(tmpdir, sample=sample)
        api = KingfisherProcessAPI.from_crawler(spider.crawler)

        reason = 'sample' if is_sample else 'finished'
        response = yield api.spider_closed(spider, reason)
        payload = yield response.json()

        expected_form = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': str(is_sample),
        }

        assert payload['method'] == 'POST'
        assert payload['url'] == 'http://httpbin.org/anything/api/v1/submit/end_collection_store/'
        assert payload['headers']['Authorization'] == 'ApiKey xxx'
        assert payload['form'] == expected_form
        assert payload['args'] == {}
        assert payload['data'] == ''
        assert payload['files'] == {}

        if not ok:
            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == 'end_collection_store failed (test) with status code: 400'
def test_spider_closed(sample, is_sample, ok, tmpdir, caplog):
    """Check the single ``requests.post`` call made when the spider finishes.

    ``requests.post`` is replaced by a mock whose return value has ``ok`` set
    from the ``ok`` argument and ``status_code`` fixed at 400. On a non-OK
    response, exactly one WARNING record is expected on the ``test`` logger.
    """
    spider = spider_with_files_store(tmpdir, sample=sample)
    extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    with patch('requests.post') as post:
        post.return_value = Mock(ok=ok, status_code=400)

        extension.spider_closed(spider, 'finished')

        expected_headers = {'Authorization': 'ApiKey xxx'}
        expected_data = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': is_sample,
        }
        post.assert_called_once_with(
            'http://httpbin.org/anything/api/v1/submit/end_collection_store/',
            headers=expected_headers,
            data=expected_data,
        )

        if not ok:
            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == 'Failed to post End Collection Store. API status code: 400'
def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2, ok, tmpdir, caplog):
    """Check the ``requests.post`` call made for an item built with ``build_file_item``.

    ``requests.post`` is replaced by a mock whose return value has ``ok`` set
    from the ``ok`` argument and ``status_code`` fixed at 400. On a non-OK
    response, exactly one WARNING record is expected on the ``test`` logger.

    ``encoding`` is the input passed to ``build_file_item`` (omitted when
    falsy) and ``encoding2`` is the encoding expected in the posted data.
    """
    spider = spider_with_files_store(tmpdir, sample=sample, note=note)
    api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    with patch('requests.post') as mocked:
        response = Mock()
        response.ok = ok
        response.status_code = 400
        mocked.return_value = response

        # Pass the *input* encoding, and only when one is given. Previously
        # ``kwargs`` was built but never used and the item was created with
        # ``encoding2`` (the expected value), so the encoding check could not fail.
        kwargs = {}
        if encoding:
            kwargs['encoding'] = encoding
        item = spider.build_file_item(
            number=1,
            file_name='data.json',
            url='https://example.com/remote.json',
            data=b'{"key": "value"}',
            data_type='release_package',
            **kwargs,
        )

        api_extension.item_scraped(item, spider)

        if not ok:
            message = 'Failed to post [https://example.com/remote.json]. API status code: 400'

            assert len(caplog.records) == 1
            assert caplog.records[0].name == 'test'
            assert caplog.records[0].levelname == 'WARNING'
            assert caplog.records[0].message == message

        expected = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': is_sample,
            'file_name': 'data.json',
            'url': 'https://example.com/remote.json',
            # Specific to this test case.
            'data_type': 'release_package',
            'encoding': encoding2,
            'number': 1,
            'data': b'{"key": "value"}',
        }
        if note:
            expected['collection_note'] = note

        mocked.assert_called_once_with(
            'http://httpbin.org/anything/api/v1/submit/item/',
            headers={
                'Authorization': 'ApiKey xxx',
            },
            proxies={
                'http': None,
                'https': None,
            },
            data=expected,
        )
def test_spider_closed_exception(tmpdir, caplog):
    """Check that an error raised while reading the response code propagates.

    The patched ``code`` property raises ``ExpectedError``, and the test
    expects that same exception out of ``spider_closed``.

    NOTE(review): this is a generator, so it presumably relies on a
    Twisted-aware decorator that is not visible in this chunk.
    """
    with patch('treq.response._Response.code', new_callable=PropertyMock) as status_code:
        status_code.side_effect = ExpectedError

        spider = spider_with_files_store(tmpdir)
        api = KingfisherProcessAPI.from_crawler(spider.crawler)

        with pytest.raises(ExpectedError):
            yield api.spider_closed(spider, 'finished')
def test_spider_closed_other_reason(tmpdir):
    """Check that closing the spider with the reason ``'xxx'`` makes no ``requests.post`` call."""
    spider = spider_with_files_store(tmpdir)
    extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    with patch('requests.post') as post:
        extension.spider_closed(spider, 'xxx')

        post.assert_not_called()
def test_from_crawler():
    """Check that ``from_crawler`` exposes the configured local directory as ``directory``."""
    spider = spider_with_crawler()

    configured = (
        ('KINGFISHER_API_URI', 'http://httpbin.org/anything'),
        ('KINGFISHER_API_KEY', 'xxx'),
        ('KINGFISHER_API_LOCAL_DIRECTORY', 'localdir'),
    )
    for name, value in configured:
        spider.crawler.settings[name] = value

    extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    assert extension.directory == 'localdir'
def test_from_crawler():
    """Check that ``from_crawler`` exposes the configured local directory as ``directory``."""
    crawler_settings = {
        'KINGFISHER_API_URI': 'http://httpbin.org/anything',
        'KINGFISHER_API_KEY': 'xxx',
        'KINGFISHER_API_LOCAL_DIRECTORY': 'localdir',
    }
    spider = spider_with_crawler(settings=crawler_settings)

    api = KingfisherProcessAPI.from_crawler(spider.crawler)

    assert api.directory == 'localdir'
def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2, ok, tmpdir, caplog):
    """Check the request sent for an item built with ``build_file_item``.

    The response status code is patched to 200 or 400 depending on ``ok``.
    ``encoding`` is passed to ``build_file_item`` only when truthy, and
    ``encoding2`` is the encoding expected in the submitted form. On a non-OK
    status, exactly one WARNING record is expected on the ``test`` logger.

    NOTE(review): this is a generator, so it presumably relies on a
    Twisted-aware decorator that is not visible in this chunk.
    """
    with patch('treq.response._Response.code', new_callable=PropertyMock) as status_code:
        if ok:
            status_code.return_value = 200
        else:
            status_code.return_value = 400

        spider = spider_with_files_store(tmpdir, sample=sample, note=note)
        api = KingfisherProcessAPI.from_crawler(spider.crawler)

        # Leave ``encoding`` out entirely when none is given.
        optional = {'encoding': encoding} if encoding else {}
        item = spider.build_file_item(
            number=1,
            file_name='data.json',
            url='https://example.com/remote.json',
            data=b'{"key": "value"}',
            data_type='release_package',
            **optional,
        )

        response = yield api.item_scraped(item, spider)
        payload = yield response.json()

        expected_form = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': str(is_sample),
            'file_name': 'data.json',
            'url': 'https://example.com/remote.json',
            # Specific to FileItem.
            'data_type': 'release_package',
            'encoding': encoding2,
            'number': '1',
            'data': '{"key": "value"}',
        }
        if note:
            expected_form['collection_note'] = note

        assert payload['method'] == 'POST'
        assert payload['url'] == 'http://httpbin.org/anything/api/v1/submit/item/'
        assert payload['headers']['Authorization'] == 'ApiKey xxx'
        assert payload['form'] == expected_form
        assert payload['args'] == {}
        assert payload['data'] == ''
        assert payload['files'] == {}

        if not ok:
            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == 'create_file_item failed (https://example.com/remote.json) with status code: 400'
def test_item_scraped_file_error(sample, is_sample, ok, tmpdir, caplog):
    """Check the ``requests.post`` call made for a ``FileError`` item.

    ``requests.post`` is replaced by a mock whose return value has ``ok`` set
    from the ``ok`` argument and ``status_code`` fixed at 400. On a non-OK
    response, exactly one WARNING record is expected on the ``test`` logger.
    """
    spider = spider_with_files_store(tmpdir, sample=sample)
    extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    with patch('requests.post') as post:
        post.return_value = Mock(ok=ok, status_code=400)

        item = FileError({
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            'errors': {'http_code': 500},
        })

        extension.item_scraped(item, spider)

        if not ok:
            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == (
                'Failed to post [https://example.com/remote.json]. File Errors API status code: 400'
            )

        expected_data = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': is_sample,
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            # Specific to this test case.
            'errors': '{"http_code": 500}',
        }

        post.assert_called_once_with(
            'http://httpbin.org/anything/api/v1/submit/file_errors/',
            headers={'Authorization': 'ApiKey xxx'},
            proxies={'http': None, 'https': None},
            data=expected_data,
        )
def test_item_scraped_file_error(sample, is_sample, ok, tmpdir, caplog):
    """Check the request sent for a ``FileError`` item.

    The response status code is patched to 200 or 400 depending on ``ok``.
    On a non-OK status, exactly one WARNING record is expected on the
    ``test`` logger.

    NOTE(review): this is a generator, so it presumably relies on a
    Twisted-aware decorator that is not visible in this chunk.
    """
    with patch('treq.response._Response.code', new_callable=PropertyMock) as status_code:
        if ok:
            status_code.return_value = 200
        else:
            status_code.return_value = 400

        spider = spider_with_files_store(tmpdir, sample=sample)
        api = KingfisherProcessAPI.from_crawler(spider.crawler)

        error_item = FileError({
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            'errors': {'http_code': 500},
        })

        response = yield api.item_scraped(error_item, spider)
        payload = yield response.json()

        expected_form = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': str(is_sample),
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            # Specific to FileError.
            'errors': '{"http_code": 500}',
        }

        assert payload['method'] == 'POST'
        assert payload['url'] == 'http://httpbin.org/anything/api/v1/submit/file_errors/'
        assert payload['headers']['Authorization'] == 'ApiKey xxx'
        assert payload['form'] == expected_form
        assert payload['args'] == {}
        assert payload['data'] == ''
        assert payload['files'] == {}

        if not ok:
            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == (
                'create_file_error failed (https://example.com/remote.json) with status code: 400'
            )
def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, post_to_api,
                           crawl_time, tmpdir, caplog):
    """Check the request sent for an item built with ``build_file``.

    The response status code is patched to 200 or 400 depending on ``ok``.
    The item is first handed to a ``KingfisherFilesStore`` extension, then to
    the API extension. When ``post_to_api`` is falsy, the API extension is
    expected to return ``None`` and log nothing. Otherwise the JSON response
    is checked, and on a non-OK status exactly one WARNING record is expected
    on the ``test`` logger.

    NOTE(review): this is a generator, so it presumably relies on a
    Twisted-aware decorator that is not visible in this chunk.
    """
    with patch('treq.response._Response.code', new_callable=PropertyMock) as status_code:
        if ok:
            status_code.return_value = 200
        else:
            status_code.return_value = 400

        if directory:
            settings = {'KINGFISHER_API_LOCAL_DIRECTORY': str(tmpdir.join('xxx'))}
        else:
            settings = {}
        spider = spider_with_files_store(
            tmpdir,
            settings=settings,
            sample=sample,
            note=note,
            crawl_time=crawl_time,
        )
        api = KingfisherProcessAPI.from_crawler(spider.crawler)

        # Leave ``encoding`` out entirely when none is given.
        optional = {'encoding': encoding} if encoding else {}
        item = spider.build_file(
            file_name='file.json',
            url='https://example.com/remote.json',
            data=b'{"key": "value"}',
            data_type='release_package',
            post_to_api=post_to_api,
            **optional,
        )

        files_store = KingfisherFilesStore.from_crawler(spider.crawler)
        files_store.item_scraped(item, spider)

        response = yield api.item_scraped(item, spider)

        if not post_to_api:
            assert response is None
        else:
            payload = yield response.json()

            expected_form = {
                'collection_source': 'test',
                'collection_data_version': '2001-02-03 04:05:06',
                'collection_sample': str(is_sample),
                'file_name': 'file.json',
                'url': 'https://example.com/remote.json',
                # Specific to File.
                'data_type': 'release_package',
                'encoding': encoding2,
            }
            if note:
                expected_form['collection_note'] = note
            if crawl_time:
                # A crawl time changes both the data version and the path of the stored file.
                expected_form['collection_data_version'] = '2020-01-01 00:00:00'
                path = path.replace('20010203_040506', '20200101_000000')
            if directory:
                expected_form['local_file_name'] = tmpdir.join('xxx', path)

            with open(tmpdir.join(path)) as f:
                assert payload['method'] == 'POST'
                assert payload['url'] == 'http://httpbin.org/anything/api/v1/submit/file/'
                assert payload['headers']['Authorization'] == 'ApiKey xxx'
                assert payload['form'] == expected_form
                assert payload['args'] == {}
                assert payload['data'] == ''
                if not directory:
                    assert payload['files'] == {'file': f.read()}
                else:
                    assert payload['files'] == {}

        if not ok:
            if not post_to_api:
                assert len(caplog.records) == 0
            else:
                assert len(caplog.records) == 1
                record = caplog.records[0]
                assert record.name == 'test'
                assert record.levelname == 'WARNING'
                assert record.message == 'create_file failed (https://example.com/remote.json) with status code: 400'
def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, post_to_api, tmpdir,
                           caplog):
    """Check the ``requests.post`` call made for an item built with ``build_file``.

    The item is first handed to a ``KingfisherFilesStore`` extension. It is
    then passed to the API extension with ``requests.post`` replaced by a
    mock whose return value has ``ok`` set from the ``ok`` argument and
    ``status_code`` fixed at 400. When ``post_to_api`` is falsy, no call and
    no log record are expected. Otherwise the call's arguments are checked
    one by one, and on a non-OK response exactly one WARNING record is
    expected on the ``test`` logger.
    """
    spider = spider_with_files_store(tmpdir, sample=sample, note=note)
    if directory:
        spider.crawler.settings['KINGFISHER_API_LOCAL_DIRECTORY'] = str(tmpdir.join('xxx'))

    files_store = KingfisherFilesStore.from_crawler(spider.crawler)
    api = KingfisherProcessAPI.from_crawler(spider.crawler)

    # Leave ``encoding`` out entirely when none is given.
    optional = {'encoding': encoding} if encoding else {}
    item = spider.build_file(
        b'{"key": "value"}',
        'file.json',
        url='https://example.com/remote.json',
        data_type='release_package',
        post_to_api=post_to_api,
        **optional,
    )

    files_store.item_scraped(item, spider)

    with patch('requests.post') as post:
        post.return_value = Mock(ok=ok, status_code=400)

        api.item_scraped(item, spider)

        if not ok:
            if post_to_api:
                assert len(caplog.records) == 1
                record = caplog.records[0]
                assert record.name == 'test'
                assert record.levelname == 'WARNING'
                assert record.message == 'Failed to post [https://example.com/remote.json]. API status code: 400'
            else:
                assert len(caplog.records) == 0

        expected_data = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': is_sample,
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            # Specific to this test case.
            'data_type': 'release_package',
            'encoding': encoding2,
        }
        if note:
            expected_data['collection_note'] = note
        if directory:
            expected_data['local_file_name'] = tmpdir.join('xxx', path)

        if post_to_api:
            with open(tmpdir.join(path), 'rb') as f:
                assert post.call_count == 1
                assert post.call_args[0] == ('http://httpbin.org/anything/api/v1/submit/file/',)
                assert post.call_args[1]['headers'] == {'Authorization': 'ApiKey xxx'}
                assert post.call_args[1]['data'] == expected_data
                assert len(post.call_args[1]) == 3

                if not directory:
                    # The file is expected as a (name, file object, content type) triple.
                    assert len(post.call_args[1]['files']) == 1
                    assert len(post.call_args[1]['files']['file']) == 3
                    assert post.call_args[1]['files']['file'][0] == 'file.json'
                    assert post.call_args[1]['files']['file'][1].read() == f.read()
                    assert post.call_args[1]['files']['file'][2] == 'application/json'
                else:
                    assert post.call_args[1]['files'] == {}
        else:
            assert post.call_count == 0