def test_duplicate_file(caplog):
    pipeline = Validate()
    spider = spider_with_crawler()
    item = File({
        'file_name': 'test1',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://example.com',
    })

    pipeline.process_item(item, spider)
    pipeline.process_item(item, spider)

    item2 = item.copy()
    item2['file_name'] = 'file2'
    pipeline.process_item(item2, spider)

    assert len(caplog.messages) == 1
    assert caplog.messages[0] == "Duplicate File: 'test1'"


def test_process_file_without_sample():
    pipeline = Sample()
    spider = spider_with_crawler()
    item = File({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == item


def test_process_item_xlsx():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.xlsx',
        'data': save_virtual_workbook(Workbook()),
        'data_type': 'release_package',
        'url': 'http://test.com/test.xlsx',
    })

    assert pipeline.process_item(item, spider) == item


def test_process_item_csv():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.csv',
        'data': b'data',
        'data_type': 'release_package',
        'url': 'http://test.com/test.csv',
    })

    assert pipeline.process_item(item, spider) == item


def test_disabled(data_type, data):
    spider = spider_with_crawler()
    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == item


def test_process_item_xlsx_error():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.xlsx',
        'data': b'data',
        'data_type': 'release_package',
        'url': 'http://test.com/test.xlsx',
    })

    with pytest.raises(BadXLSXZipFile):
        pipeline.process_item(item, spider)


def test_process_item_extension_error():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'file',
        'data': b'data',
        'data_type': 'release_package',
        'url': 'http://test.com/file',
    })

    with pytest.raises(NotImplementedError):
        pipeline.process_item(item, spider)


def parse(self, response):
    archive_name, archive_format = get_file_name_and_extension(response.request.meta['file_name'])

    if archive_format == 'zip':
        cls = ZipFile
    elif archive_format == 'rar':
        cls = RarFile
    else:
        raise UnknownArchiveFormatError(response.request.meta['file_name'])

    # If we use a context manager here, the archive file might close before the item pipeline reads from the file
    # handlers of the compressed files.
    archive_file = cls(BytesIO(response.body))

    number = 1
    for file_info in archive_file.infolist():
        # Avoid reading the rest of a large file, since the rest of the items will be dropped.
        if self.sample and number > self.sample:
            break

        filename = file_info.filename
        basename = os.path.basename(filename)
        if self.file_name_must_contain not in basename:
            continue
        if archive_format == 'rar' and file_info.isdir():
            continue
        if archive_format == 'zip' and file_info.is_dir():
            continue
        if not basename.endswith('.json'):
            basename += '.json'

        compressed_file = archive_file.open(filename)

        # If `resize_package = True`, then we need to open the file twice: once to extract the package metadata and
        # then to extract the releases themselves.
        if self.resize_package:
            data = {'data': compressed_file, 'package': archive_file.open(filename)}
        else:
            data = compressed_file

        yield File({
            'file_name': basename,
            'data': data,
            'data_type': self.data_type,
            'url': response.request.url,
            'encoding': self.encoding,
        })
        number += 1
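

# A minimal usage sketch for the parse() method above, not part of the test suite. It assumes
# the method belongs to a spider base class (named CompressedFileSpider here, an assumption)
# whose subclasses provide the attributes parse() reads: data_type, encoding, sample,
# file_name_must_contain and resize_package. The spider name and URL are placeholders.
import scrapy


class ExampleArchiveSpider(CompressedFileSpider):
    name = 'example_archive'
    data_type = 'release_package'

    def start_requests(self):
        # parse() infers the archive format ('zip' or 'rar') from the extension of the
        # 'file_name' request meta key, and yields one File item per matching member.
        yield scrapy.Request('http://example.com/releases.zip', meta={'file_name': 'releases.zip'})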


def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', post_to_api=True):
    """
    Returns a File item to yield.
    """
    return File({
        'file_name': file_name,
        'data': data,
        'data_type': data_type,
        'url': url,
        'encoding': encoding,
        'post_to_api': post_to_api,
    })
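

# A sketch of a typical call site (illustrative only; the file name and data type are
# placeholders): a download callback wraps the response body in a File item via build_file.
# The build_file_from_response helper exercised in test_build_file_from_response below
# covers the common case where the data and URL come straight from a response.
def parse(self, response):
    yield self.build_file(
        file_name='example.json',
        url=response.request.url,
        data=response.body,
        data_type='release_package',
    )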


def test_process_item_non_package_data_type():
    spider = spider_with_crawler(package_pointer='/publishedDate')
    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': json.dumps(releases[0]),
        'data_type': 'release',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == PluckedItem({'value': 'error: /publishedDate not found'})


def test_process_item_release_pointer(data_type, data):
    spider = spider_with_crawler(release_pointer='/date', truncate=10)
    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == PluckedItem({'value': '2020-10-01'})


def test_process_item_package_pointer(data_type, data):
    spider = spider_with_crawler(package_pointer='/publishedDate')
    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == PluckedItem({'value': '2000-01-01T00:00:00Z'})


def test_process_item_nonexistent_pointer(kwargs):
    spider = spider_with_crawler(**kwargs)
    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': json.dumps(release_package),
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == PluckedItem({'value': 'error: /nonexistent not found'})


def test_process_item_error():
    pipeline = Validate()
    item = File({
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    with pytest.raises(ValidationError):
        pipeline.process_item(item, None)

    item['file_name'] = 'test'
    item['data_type'] = 'not a valid data type'

    with pytest.raises(ValidationError):
        pipeline.process_item(item, None)


def test_process_file_with_sample():
    pipeline = Sample()
    spider = spider_with_crawler(sample=1)
    crawler = MagicMock()
    spider.crawler = crawler
    item = File({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == item

    with pytest.raises(DropItem):
        pipeline.process_item(item, spider)


def test_process_item_incomplete_json():
    spider = spider_with_crawler(package_pointer='/publishedDate')
    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': b'{"key": "value"',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == {
        'value': 'error: /publishedDate not found within initial bytes'
    }


def test_process_item():
    pipeline = Validate()
    item = File({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, None) == item

    item['data'] = item['data'].encode('ascii')
    item['file_name'] = 'test2'

    assert pipeline.process_item(item, None) == item


def test_process_item(data_type, data):
    spider = spider_with_crawler()
    spider.latest = True
    pipeline = LatestReleaseDate()
    item = File({
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == LatestReleaseDateItem({'date': '2020-10-01'})

    spider.latest = False
    spider.name = 'other'

    assert pipeline.process_item(item, spider) == item


def test_build_file():
    spider = BaseSpider(name='test')

    data = b'{"key": "value"}'
    url = 'https://example.com/remote.json'

    actual = spider.build_file(file_name='file.json', url=url, data=data, data_type='release_package',
                               encoding='iso-8859-1')

    assert actual == File({
        'file_name': 'file.json',
        'data': b'{"key": "value"}',
        'data_type': 'release_package',
        'url': 'https://example.com/remote.json',
        'encoding': 'iso-8859-1',
        'post_to_api': True,
    })


def test_build_file_from_response():
    spider = BaseSpider(name='test')

    response = Mock()
    response.body = b'{"key": "value"}'
    response.request = Mock()
    response.request.url = 'https://example.com/remote.json'

    actual = spider.build_file_from_response(response, 'file.json', data_type='release_package',
                                             encoding='iso-8859-1')

    assert actual == File({
        'file_name': 'file.json',
        'data': b'{"key": "value"}',
        'data_type': 'release_package',
        'url': 'https://example.com/remote.json',
        'encoding': 'iso-8859-1',
        'post_to_api': True,
    })


def test_data_types(data_type, data, root_path):
    spider = spider_with_crawler()
    spider.root_path = root_path
    root_path_middleware = RootPathMiddleware()
    add_package_middleware = AddPackageMiddleware()

    item = File({
        'file_name': 'test',
        'data': data,
        'data_type': data_type,
        'url': 'http://test.com',
        'encoding': 'utf-8',
    })

    generator = root_path_middleware.process_spider_output(None, [item], spider)
    item = next(generator)
    generator = add_package_middleware.process_spider_output(None, [item], spider)
    item = next(generator)

    expected = {
        'file_name': 'test',
        'url': 'http://test.com',
        'encoding': 'utf-8',
    }
    if root_path:
        expected['number'] = 1
    if 'package' in data_type:
        expected['data'] = {f'{data_type[:-8]}s': [{'ocid': 'abc'}], 'uri': 'test'}
        expected['data_type'] = data_type
    else:
        expected['data'] = {f'{data_type}s': [{'ocid': 'abc'}]}
        expected['data_type'] = f'{data_type}_package'

    assert item == expected


from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ReadDataMiddleware,
                                           ResizePackageMiddleware, RootPathMiddleware)
from tests import response_fixture, spider_with_crawler


@pytest.mark.parametrize('middleware_class', [
    AddPackageMiddleware,
    LineDelimitedMiddleware,
    ResizePackageMiddleware,
    RootPathMiddleware,
    ReadDataMiddleware,
])
@pytest.mark.parametrize('item', [
    File({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    }),
    FileError({
        'file_name': 'test',
        'url': 'http://test.com',
        'errors': '',
    }),
])
def test_yield_items(middleware_class, item):
    spider = spider_with_crawler()
    middleware = middleware_class()

    generator = middleware.process_spider_output(None, [item], spider)
    returned_item = next(generator)

    assert returned_item == item


    with open(tmpdir.join(path)) as f:
        assert f.read() == '{"key": "value"}'

    assert item['path'] == path
    assert item['files_store'] == tmpdir


@pytest.mark.parametrize('sample,directory', [
    (None, os.path.join('test', '20010203_040506')),
    ('true', os.path.join('test_sample', '20010203_040506')),
])
@pytest.mark.parametrize('data', [b'{"key": "value"}', {'key': 'value'}])
@pytest.mark.parametrize('item,expected_file_name', [
    (File({'file_name': 'file.json', 'encoding': 'iso-8859-1'}), 'file.json'),
    (FileItem({'number': 1, 'file_name': 'file.json'}), 'file-1.json'),
])
def test_item_scraped_with_file_and_file_item(sample, directory, data, item, expected_file_name, tmpdir):
    spider = spider_with_files_store(tmpdir, sample=sample)
    extension = KingfisherFilesStore.from_crawler(spider.crawler)
    path = os.path.join(directory, expected_file_name)
    original_file_name = item['file_name']
    item['data'] = data

    extension.item_scraped(item, spider)

    with open(tmpdir.join(path)) as f:
        assert f.read() == '{"key": "value"}'
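

# A note on the layout observed in the parametrized cases above (a summary of the test data,
# not of the extension's full behavior): KingfisherFilesStore writes files under
# <files_store>/<spider_name>[_sample]/<crawl start time as YYYYMMDD_HHMMSS>/, and FileItem
# instances get a '-<number>' suffix inserted before the file extension (file-1.json).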