def test_from_crawler_missing_arguments():
    spider = spider_with_crawler()

    with pytest.raises(NotConfigured) as excinfo:
        KingfisherFilesStore.from_crawler(spider.crawler)

    assert str(excinfo.value) == 'FILES_STORE is not set.'


def test_item_scraped_with_build_file_item():
    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider = spider_with_crawler(settings={'FILES_STORE': files_store})
        extension = KingfisherFilesStore.from_crawler(spider.crawler)

        assert extension.item_scraped(spider.build_file_item(), spider) is None


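# The tests below also rely on spider_with_files_store(), a shared helper whose real
# implementation lives elsewhere in the test package. A minimal sketch, under a
# hypothetical name, of the behavior assumed here: FILES_STORE points at tmpdir
# itself (the tests assert item['files_store'] == tmpdir), and extra keyword
# arguments (sample, note, crawl_time) are forwarded to the spider.
def _spider_with_files_store_sketch(tmpdir, settings=None, **kwargs):
    settings = dict(settings or {})
    settings.setdefault('FILES_STORE', str(tmpdir))  # assumed from the assertions below
    return spider_with_crawler(settings=settings, **kwargs)

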
def test_item_scraped_with_build_file(sample, path, tmpdir):
    spider = spider_with_files_store(tmpdir, sample=sample)
    store_extension = KingfisherFilesStore.from_crawler(spider.crawler)

    data = b'{"key": "value"}'
    url = 'https://example.com/remote.json'
    item = spider.build_file(file_name='file.json', url=url, data=data, data_type='release_package',
                             encoding='iso-8859-1')
    store_extension.item_scraped(item, spider)

    with open(tmpdir.join(path)) as f:
        assert f.read() == '{"key": "value"}'

    with open(tmpdir.join(path + '.fileinfo')) as f:
        assert json.load(f) == {
            'url': 'https://example.com/remote.json',
            'data_type': 'release_package',
            'encoding': 'iso-8859-1',
        }

    assert item['path'] == path
    assert item['files_store'] == tmpdir


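# The (sample, path) arguments above come from a @pytest.mark.parametrize decorator
# omitted from this excerpt. A hypothetical parametrization, consistent with the
# '<spider name>/<crawl time>/<file name>' layout these tests create (the sample
# value and the '_sample' directory suffix are assumptions, not confirmed here):
example_file_store_params = pytest.mark.parametrize('sample,path', [
    (None, os.path.join('test', '20010203_040506', 'file.json')),
    ('true', os.path.join('test_sample', '20010203_040506', 'file.json')),
])

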
def test_build_file_with_existing_directory():
    spider = spider_with_crawler()

    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider.crawler.settings['FILES_STORE'] = files_store
        store_extension = KingfisherFilesStore.from_crawler(spider.crawler)

        os.makedirs(os.path.join(files_store, 'test', '20010203_040506'))

        # No FileExistsError exception.
        store_extension.item_scraped(spider.build_file(b'{"key": "value"}', 'file.json'), spider)


def test_item_scraped_with_build_file_and_existing_directory():
    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider = spider_with_crawler(settings={'FILES_STORE': files_store})
        extension = KingfisherFilesStore.from_crawler(spider.crawler)
        item = spider.build_file(file_name='file.json', data=b'{"key": "value"}')

        os.makedirs(os.path.join(files_store, 'test', '20010203_040506'))

        # No FileExistsError exception.
        extension.item_scraped(item, spider)


def test_item_scraped_with_file_and_file_item(sample, directory, data, item, expected_file_name, tmpdir):
    spider = spider_with_files_store(tmpdir, sample=sample)
    extension = KingfisherFilesStore.from_crawler(spider.crawler)

    path = os.path.join(directory, expected_file_name)
    original_file_name = item['file_name']
    item['data'] = data
    extension.item_scraped(item, spider)

    with open(tmpdir.join(path)) as f:
        assert f.read() == '{"key": "value"}'

    assert item['path'] == path
    assert item['files_store'] == tmpdir
    assert item['file_name'] == original_file_name


def test_item_scraped_with_build_file_from_response(sample, path, tmpdir):
    spider = spider_with_files_store(tmpdir, sample=sample)
    extension = KingfisherFilesStore.from_crawler(spider.crawler)

    response = Mock()
    response.body = b'{"key": "value"}'
    response.request = Mock()
    response.request.url = 'https://example.com/remote.json'
    response.request.meta = {'file_name': 'file.json'}

    item = spider.build_file_from_response(response, file_name='file.json', data_type='release_package',
                                           encoding='iso-8859-1')
    extension.item_scraped(item, spider)

    with open(tmpdir.join(path)) as f:
        assert f.read() == '{"key": "value"}'

    assert item['path'] == path
    assert item['files_store'] == tmpdir


# This test yields Deferreds from treq, so it needs an inlineCallbacks-style runner;
# @pytest_twisted.inlineCallbacks (with pytest_twisted imported at the top of the
# module) is assumed here.
@pytest_twisted.inlineCallbacks
def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, post_to_api,
                           crawl_time, tmpdir, caplog):
    with patch('treq.response._Response.code', new_callable=PropertyMock) as mocked:
        mocked.return_value = 200 if ok else 400

        settings = {}
        if directory:
            settings['KINGFISHER_API_LOCAL_DIRECTORY'] = str(tmpdir.join('xxx'))

        spider = spider_with_files_store(tmpdir, settings=settings, sample=sample, note=note,
                                         crawl_time=crawl_time)
        extension = KingfisherProcessAPI.from_crawler(spider.crawler)

        kwargs = {}
        if encoding:
            kwargs['encoding'] = encoding
        item = spider.build_file(
            file_name='file.json',
            url='https://example.com/remote.json',
            data=b'{"key": "value"}',
            data_type='release_package',
            post_to_api=post_to_api,
            **kwargs,
        )

        store_extension = KingfisherFilesStore.from_crawler(spider.crawler)
        store_extension.item_scraped(item, spider)

        response = yield extension.item_scraped(item, spider)
        if post_to_api:
            data = yield response.json()

            form = {
                'collection_source': 'test',
                'collection_data_version': '2001-02-03 04:05:06',
                'collection_sample': str(is_sample),
                'file_name': 'file.json',
                'url': 'https://example.com/remote.json',
                # Specific to File.
                'data_type': 'release_package',
                'encoding': encoding2,
            }
            if note:
                form['collection_note'] = note
            if crawl_time:
                form['collection_data_version'] = '2020-01-01 00:00:00'
                path = path.replace('20010203_040506', '20200101_000000')
            if directory:
                form['local_file_name'] = tmpdir.join('xxx', path)

            with open(tmpdir.join(path)) as f:
                assert data['method'] == 'POST'
                assert data['url'] == 'http://httpbin.org/anything/api/v1/submit/file/'
                assert data['headers']['Authorization'] == 'ApiKey xxx'
                assert data['form'] == form
                assert data['args'] == {}
                assert data['data'] == ''
                if directory:
                    assert data['files'] == {}
                else:
                    assert data['files'] == {'file': f.read()}
        else:
            assert response is None

        if not ok:
            if post_to_api:
                message = 'create_file failed (https://example.com/remote.json) with status code: 400'

                assert len(caplog.records) == 1
                assert caplog.records[0].name == 'test'
                assert caplog.records[0].levelname == 'WARNING'
                assert caplog.records[0].message == message
            else:
                assert len(caplog.records) == 0


# Synchronous variant of test_item_scraped_file: it mocks requests.post instead of
# treq, and carries a distinct name so it does not shadow the test above.
def test_item_scraped_file_with_requests(sample, is_sample, path, note, encoding, encoding2, directory, ok,
                                         post_to_api, tmpdir, caplog):
    spider = spider_with_files_store(tmpdir, sample=sample, note=note)

    if directory:
        spider.crawler.settings['KINGFISHER_API_LOCAL_DIRECTORY'] = str(tmpdir.join('xxx'))

    store_extension = KingfisherFilesStore.from_crawler(spider.crawler)
    api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    kwargs = {}
    if encoding:
        kwargs['encoding'] = encoding
    item = spider.build_file(b'{"key": "value"}', 'file.json', url='https://example.com/remote.json',
                             data_type='release_package', post_to_api=post_to_api, **kwargs)
    store_extension.item_scraped(item, spider)

    with patch('requests.post') as mocked:
        response = Mock()
        response.ok = ok
        response.status_code = 400
        mocked.return_value = response

        api_extension.item_scraped(item, spider)

        if not ok:
            if not post_to_api:
                assert len(caplog.records) == 0
            else:
                message = 'Failed to post [https://example.com/remote.json]. API status code: 400'

                assert len(caplog.records) == 1
                assert caplog.records[0].name == 'test'
                assert caplog.records[0].levelname == 'WARNING'
                assert caplog.records[0].message == message

        expected = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': is_sample,
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            # Specific to this test case.
            'data_type': 'release_package',
            'encoding': encoding2,
        }
        if note:
            expected['collection_note'] = note
        if directory:
            expected['local_file_name'] = tmpdir.join('xxx', path)

        if not post_to_api:
            assert mocked.call_count == 0
        else:
            with open(tmpdir.join(path), 'rb') as f:
                assert mocked.call_count == 1
                assert mocked.call_args[0] == ('http://httpbin.org/anything/api/v1/submit/file/',)
                assert mocked.call_args[1]['headers'] == {'Authorization': 'ApiKey xxx'}
                assert mocked.call_args[1]['data'] == expected
                assert len(mocked.call_args[1]) == 3

                if directory:
                    assert mocked.call_args[1]['files'] == {}
                else:
                    assert len(mocked.call_args[1]['files']) == 1
                    assert len(mocked.call_args[1]['files']['file']) == 3
                    assert mocked.call_args[1]['files']['file'][0] == 'file.json'
                    assert mocked.call_args[1]['files']['file'][1].read() == f.read()
                    assert mocked.call_args[1]['files']['file'][2] == 'application/json'
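

# Likewise, the many arguments of the two API tests above come from parametrize
# decorators omitted from this excerpt. Hypothetical rows illustrating the
# relationships the assertions rely on: encoding2 is the encoding expected after
# defaulting (e.g. 'utf-8' when encoding is None), and ok toggles the mocked HTTP
# response between success and a 400.
example_api_params = pytest.mark.parametrize(
    'sample,is_sample,path,note,encoding,encoding2,directory,ok,post_to_api',
    [
        (None, False, os.path.join('test', '20010203_040506', 'file.json'),
         None, None, 'utf-8', False, True, True),
        (None, False, os.path.join('test', '20010203_040506', 'file.json'),
         'a note', 'iso-8859-1', 'iso-8859-1', False, False, True),
    ],
)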