Example #1
def test_duplicate_file(caplog):
    pipeline = Validate()
    spider = spider_with_crawler()
    item = File({
        'file_name': 'test1',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://example.com',
    })

    pipeline.process_item(item, spider)
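    # A second call with the same file_name is logged as a duplicate.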
    pipeline.process_item(item, spider)
    item2 = item.copy()
    item2['file_name'] = 'file2'
    pipeline.process_item(item2, spider)

    assert len(caplog.messages) == 1
    assert caplog.messages[0] == "Duplicate File: 'test1'"
Example #2
def test_process_file_without_sample():
    pipeline = Sample()
    spider = spider_with_crawler()
    item = File({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })
    assert pipeline.process_item(item, spider) == item
Example #3
def test_process_item_xlsx():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.xlsx',
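        # save_virtual_workbook serializes an empty Workbook to in-memory xlsx bytes.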
        'data': save_virtual_workbook(Workbook()),
        'data_type': 'release_package',
        'url': 'http://test.com/test.xlsx',
    })

    assert pipeline.process_item(item, spider) == item
Example #4
def test_process_item_csv():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.csv',
        'data': b'data',
        'data_type': 'release_package',
        'url': 'http://test.com/test.csv',
    })

    assert pipeline.process_item(item, spider) == item
Example #5
def test_disabled(data_type, data):
    spider = spider_with_crawler()

    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == item
Example #6
def test_process_item_xlsx_error():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.xlsx',
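        # Not a valid xlsx (zip) payload, so processing it raises BadXLSXZipFile below.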
        'data': b'data',
        'data_type': 'release_package',
        'url': 'http://test.com/test.xlsx',
    })

    with pytest.raises(BadXLSXZipFile):
        pipeline.process_item(item, spider)
Example #7
def test_process_item_extension_error():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'file',
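        # No file extension, so the pipeline has no handler and raises NotImplementedError.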
        'data': b'data',
        'data_type': 'release_package',
        'url': 'http://test.com/file',
    })

    with pytest.raises(NotImplementedError):
        pipeline.process_item(item, spider)
Example #8
    def parse(self, response):
        archive_name, archive_format = get_file_name_and_extension(
            response.request.meta['file_name'])

        if archive_format == 'zip':
            cls = ZipFile
        elif archive_format == 'rar':
            cls = RarFile
        else:
            raise UnknownArchiveFormatError(response.request.meta['file_name'])

        # If we use a context manager here, the archive file might close before the item pipeline reads from the file
        # handles of the compressed files.
        archive_file = cls(BytesIO(response.body))

        number = 1
        for file_info in archive_file.infolist():
            # Avoid reading the rest of a large file, since the rest of the items will be dropped.
            if self.sample and number > self.sample:
                break

            filename = file_info.filename
            basename = os.path.basename(filename)
            if self.file_name_must_contain not in basename:
                continue
            if archive_format == 'rar' and file_info.isdir():
                continue
            if archive_format == 'zip' and file_info.is_dir():
                continue
            if not basename.endswith('.json'):
                basename += '.json'

            compressed_file = archive_file.open(filename)

            # If `resize_package = True`, the file must be opened twice: once to extract the package metadata and
            # again to extract the releases themselves.
            if self.resize_package:
                data = {
                    'data': compressed_file,
                    'package': archive_file.open(filename)
                }
            else:
                data = compressed_file

            yield File({
                'file_name': basename,
                'data': data,
                'data_type': self.data_type,
                'url': response.request.url,
                'encoding': self.encoding
            })

            number += 1
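
A minimal sketch, not from the source, of the stdlib ZipFile behavior that parse() relies on (infolist(), is_dir(), open()), using an in-memory archive:

from io import BytesIO
from zipfile import ZipFile

# Build a small archive in memory.
buffer = BytesIO()
with ZipFile(buffer, 'w') as writer:
    writer.writestr('data/release.json', b'{"releases": []}')

# Read it back the way parse() does: no context manager, so the file objects
# returned by open() stay usable after iteration.
archive_file = ZipFile(BytesIO(buffer.getvalue()))
for file_info in archive_file.infolist():
    if file_info.is_dir():
        continue
    compressed_file = archive_file.open(file_info.filename)
    assert compressed_file.read() == b'{"releases": []}'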
Example #9
    def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', post_to_api=True):
        """
        Returns a File item to yield.
        """
        return File({
            'file_name': file_name,
            'data': data,
            'data_type': data_type,
            'url': url,
            'encoding': encoding,
            'post_to_api': post_to_api,
        })
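
A hypothetical usage sketch (this callback and its data_type attribute are assumptions, not from the source): the helper is typically yielded from a spider callback.

def parse(self, response):
    # 'self.data_type' here is a hypothetical spider attribute.
    yield self.build_file(file_name='all.json', url=response.request.url,
                          data=response.body, data_type=self.data_type)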
Example #10
def test_process_item_non_package_data_type():
    spider = spider_with_crawler(package_pointer='/publishedDate')

    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': json.dumps(releases[0]),
        'data_type': 'release',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == PluckedItem(
        {'value': 'error: /publishedDate not found'})
Example #11
def test_process_item_release_pointer(data_type, data):
    spider = spider_with_crawler(release_pointer='/date', truncate=10)

    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == PluckedItem(
        {'value': '2020-10-01'})
Example #12
def test_process_item_package_pointer(data_type, data):
    spider = spider_with_crawler(package_pointer='/publishedDate')

    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == PluckedItem(
        {'value': '2000-01-01T00:00:00Z'})
Example #13
def test_process_item_nonexistent_pointer(kwargs):
    spider = spider_with_crawler(**kwargs)

    pipeline = Pluck()
    item = File({
        'file_name': 'test',
        'data': json.dumps(release_package),
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == PluckedItem(
        {'value': 'error: /nonexistent not found'})
Example #14
def test_process_item_error():
    pipeline = Validate()
    item = File({
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    with pytest.raises(ValidationError):
        pipeline.process_item(item, None)
    item['file_name'] = 'test'
    item['data_type'] = 'not a valid data type'
    with pytest.raises(ValidationError):
        pipeline.process_item(item, None)
Example #15
def test_process_file_with_sample():
    pipeline = Sample()
    spider = spider_with_crawler(sample=1)
    crawler = MagicMock()
    spider.crawler = crawler
    item = File({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })
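    # With sample=1, the first item is returned and subsequent items raise DropItem.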
    assert pipeline.process_item(item, spider) == item
    with pytest.raises(DropItem):
        pipeline.process_item(item, spider)
Example #16
def test_process_item_incomplete_json():
    spider = spider_with_crawler(package_pointer='/publishedDate')

    pipeline = Pluck()
    item = File({
        'file_name': 'test',
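        # Deliberately truncated JSON: the pointer isn't found within the bytes the pipeline reads.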
        'data': b'{"key": "value"',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == {
        'value': 'error: /publishedDate not found within initial bytes'
    }
Example #17
def test_process_item():
    pipeline = Validate()
    item = File({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, None) == item

    item['data'] = item['data'].encode('ascii')
    item['file_name'] = 'test2'

    assert pipeline.process_item(item, None) == item
Example #18
def test_process_item(data_type, data):
    spider = spider_with_crawler()
    spider.latest = True

    pipeline = LatestReleaseDate()
    item = File({
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == LatestReleaseDateItem(
        {'date': '2020-10-01'})

    spider.latest = False
    spider.name = 'other'

    assert pipeline.process_item(item, spider) == item
Example #19
def test_build_file():
    spider = BaseSpider(name='test')

    data = b'{"key": "value"}'
    url = 'https://example.com/remote.json'

    actual = spider.build_file(file_name='file.json',
                               url=url,
                               data=data,
                               data_type='release_package',
                               encoding='iso-8859-1')

    assert actual == File({
        'file_name': 'file.json',
        'data': b'{"key": "value"}',
        'data_type': 'release_package',
        'url': 'https://example.com/remote.json',
        'encoding': 'iso-8859-1',
    })
Example #20
def test_build_file_from_response():
    spider = BaseSpider(name='test')

    response = Mock()
    response.body = b'{"key": "value"}'
    response.request = Mock()
    response.request.url = 'https://example.com/remote.json'

    actual = spider.build_file_from_response(response,
                                             'file.json',
                                             data_type='release_package',
                                             encoding='iso-8859-1')

    assert actual == File({
        'file_name': 'file.json',
        'data': b'{"key": "value"}',
        'data_type': 'release_package',
        'url': 'https://example.com/remote.json',
        'encoding': 'iso-8859-1',
        'post_to_api': True,
    })
Example #21
def test_data_types(data_type, data, root_path):
    spider = spider_with_crawler()
    spider.root_path = root_path

    root_path_middleware = RootPathMiddleware()
    add_package_middleware = AddPackageMiddleware()

    item = File({
        'file_name': 'test',
        'data': data,
        'data_type': data_type,
        'url': 'http://test.com',
        'encoding': 'utf-8'
    })

    generator = root_path_middleware.process_spider_output(None, [item], spider)
    item = next(generator)
    generator = add_package_middleware.process_spider_output(None, [item], spider)
    item = next(generator)

    expected = {
        'file_name': 'test',
        'url': 'http://test.com',
        'encoding': 'utf-8',
    }
    if root_path:
        expected['number'] = 1

    if 'package' in data_type:
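        # len('_package') == 8, so data_type[:-8] turns 'release_package' into 'release'.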
        expected['data'] = {f"{data_type[:-8]}s": [{"ocid": "abc"}], "uri": "test"}
        expected['data_type'] = data_type
    else:
        expected['data'] = {f"{data_type}s": [{"ocid": "abc"}]}
        expected['data_type'] = f'{data_type}_package'

    assert item == expected
Example #22
import pytest

from kingfisher_scrapy.items import File, FileError
from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ReadDataMiddleware,
                                           ResizePackageMiddleware, RootPathMiddleware)
from tests import response_fixture, spider_with_crawler


@pytest.mark.parametrize('middleware_class', [
    AddPackageMiddleware,
    LineDelimitedMiddleware,
    ResizePackageMiddleware,
    RootPathMiddleware,
    ReadDataMiddleware,
])
@pytest.mark.parametrize('item', [
    File({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    }),
    FileError({
        'file_name': 'test',
        'url': 'http://test.com',
        'errors': ''
    }),
])
def test_yield_items(middleware_class, item):
    spider = spider_with_crawler()

    middleware = middleware_class()

    generator = middleware.process_spider_output(None, [item], spider)
    returned_item = next(generator)

    # None of these middlewares applies to the given items, so they pass through unchanged.
    assert returned_item == item
Example #23
    with open(tmpdir.join(path)) as f:
        assert f.read() == '{"key": "value"}'

    assert item['path'] == path
    assert item['files_store'] == tmpdir


@pytest.mark.parametrize('sample,directory', [
    (None, os.path.join('test', '20010203_040506')),
    ('true', os.path.join('test_sample', '20010203_040506')),
])
@pytest.mark.parametrize('data', [b'{"key": "value"}', {"key": "value"}])
@pytest.mark.parametrize('item,expected_file_name',
                         [(File({
                             'file_name': 'file.json',
                             'encoding': 'iso-8859-1'
                         }), 'file.json'),
                          (FileItem({
                              'number': 1,
                              'file_name': 'file.json'
                          }), 'file-1.json')])
def test_item_scraped_with_file_and_file_item(sample, directory, data, item,
                                              expected_file_name, tmpdir):
    spider = spider_with_files_store(tmpdir, sample=sample)
    extension = KingfisherFilesStore.from_crawler(spider.crawler)
    path = os.path.join(directory, expected_file_name)
    original_file_name = item['file_name']
    item['data'] = data
    extension.item_scraped(item, spider)
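    # Both parametrized forms of 'data' (bytes and dict) are written as the same JSON text.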
    with open(tmpdir.join(path)) as f:
        assert f.read() == '{"key": "value"}'

    assert item['path'] == path
    assert item['files_store'] == tmpdir

    # Reset the parametrized item, which is shared across parameter combinations.
    item['file_name'] = original_file_name