def test_crawl_time_invalid():
    expected = "spider argument `crawl_time`: invalid date value: time data '2020' does not match format " \
               "'%Y-%m-%dT%H:%M:%S'"

    with pytest.raises(SpiderArgumentError) as e:
        spider_with_crawler(crawl_time='2020')
    assert str(e.value) == expected
def test_until_date_invalid():
    expected = "spider argument `until_date`: invalid date value: time data 'invalid' does not match format '%Y-%m-%d'"

    with pytest.raises(SpiderArgumentError) as e:
        spider_with_crawler(until_date='invalid',
                            default_from_date='2000-01-01')
    assert str(e.value) == expected
def test_from_crawler():
    with pytest.raises(SpiderArgumentError) as excinfo:
        spider_with_crawler(release_pointer='/date',
                            package_pointer='/publishedDate')

    assert str(excinfo.value) == 'You cannot specify both package_pointer and release_pointer spider arguments.'
def test_custom_collection_data_version():
    error_message = "time data '2020' does not match format '%Y-%m-%dT%H:%M:%S'"

    assert spider_with_crawler(crawl_time='2020-01-01T00:00:00')
    with pytest.raises(SpiderArgumentError) as e:
        spider_with_crawler(crawl_time='2020')
    assert str(e.value) == f'spider argument `crawl_time`: invalid date value: {error_message}'
def test_parse():
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'

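    # Build an in-memory zip archive holding a single empty JSON object; the spider is expected to yield one File for the entry.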
    io = BytesIO()
    with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
        zipfile.writestr('test.json', '{}')

    response = response_fixture(body=io.getvalue(),
                                meta={'file_name': 'test.zip'})
    generator = spider.parse(response)
    item = next(generator)

    assert type(item) is File
    assert item == {
        'file_name': 'test.json',
        'url': 'http://example.com',
        'data': b'{}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    }

    with pytest.raises(StopIteration):
        next(generator)
def test_line_delimited_json_middleware_compressed(sample):
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.line_delimited = True

    middleware = LineDelimitedMiddleware()

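    # Zip 20 newline-delimited JSON objects; LineDelimitedMiddleware is expected to turn the single File into one FileItem per line.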
    content = []
    for i in range(1, 21):
        content.append('{"key": %s}\n' % i)

    io = BytesIO()
    with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
        zipfile.writestr('test.json', ''.join(content))

    response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
    generator = spider.parse(response)
    item = next(generator)

    generator = middleware.process_spider_output(response, [item], spider)
    transformed_items = list(generator)

    for i, item in enumerate(transformed_items, 1):
        assert type(item) is FileItem
        assert item == {
            'file_name': 'test.json',
            'url': 'http://example.com',
            'number': i,
            'data': '{"key": %s}\n' % i,
            'data_type': 'release_package',
            'encoding': 'utf-8'
        }
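# `meta` and `expected` come from a pytest parametrization (or fixture) that is not shown in this excerpt.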
def test_middleware_output(meta, expected):
    spider = spider_with_crawler()
    middleware = DelayedRequestMiddleware()
    request = Request('http://example.com', meta=meta)
    output = middleware.process_request(request, spider)

    assert isinstance(output, expected)
def test_item_scraped_with_build_file_item():
    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider = spider_with_crawler(settings={'FILES_STORE': files_store})
        extension = KingfisherFilesStore.from_crawler(spider.crawler)

        assert extension.item_scraped(spider.build_file_item(), spider) is None
def test_parse_rar_file():
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'
    spider.archive_format = 'rar'

    # The rar library doesn't support write mode, so a static rar file from the test data directory is used instead.
    rar_file_path = os.path.join(
        pathlib.Path(__file__).parent.absolute(), 'data', 'test.rar')
    with open(rar_file_path, 'rb') as f:
        io = BytesIO(f.read())
    response = response_fixture(body=io.getvalue())
    generator = spider.parse(response)
    item = next(generator)

    assert type(item) is File
    assert item == {
        'file_name': 'test.json',
        'url': 'http://example.com',
        'data': b'',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True
    }

    with pytest.raises(StopIteration):
        next(generator)
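# `sample` and `len_releases` come from a pytest parametrization (or fixture) that is not shown in this excerpt.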
def test_parse_release_package(sample, len_releases):
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.resize_package = True

    middleware = ResizePackageMiddleware()

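    # Build a 200-release package; ResizePackageMiddleware is expected to split it into packages of `len_releases` releases each.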
    package = {'releases': []}
    for i in range(200):
        package['releases'].append({'key': 'value'})

    io = BytesIO()
    with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
        zipfile.writestr('test.json', json.dumps(package))

    response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
    generator = spider.parse(response)
    item = next(generator)

    generator = middleware.process_spider_output(response, [item], spider)
    transformed_items = list(generator)

    for i, item in enumerate(transformed_items, 1):
        assert type(item) is FileItem
        assert len(item) == 6
        assert item['file_name'] == 'test.json'
        assert item['url'] == 'http://example.com'
        assert item['number'] == i
        assert len(json.loads(item['data'])['releases']) == len_releases
        assert item['data_type'] == 'release_package'
        assert item['encoding'] == 'utf-8'
def test_next_link_condition():
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.from_date = spider.until_date = date(2002, 12, 31)

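    # With a single-day date range and an empty "next" URL, next_link is expected to return None.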
    request = spider.next_link(response_fixture(body='{"links": {"next": ""}}'))

    assert request is None
def test_parse_zipfile_200():
    spider = spider_with_crawler(spider_class=ZipSpider)

    response = text.TextResponse('test')
    response.status = 200
    response.request = Mock()
    response.request.meta = {'kf_filename': 'test.json'}
    response.request.url = 'url'

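    # Write a zip file to the crawl directory on disk, then feed its bytes back in as the response body.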
    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider.crawler.settings['FILES_STORE'] = files_store
        tmp = os.path.join(files_store, 'test', '20010203_040506')
        os.makedirs(tmp)

        with open(os.path.join(tmp, 'test'), 'w'):
            pass
        with ZipFile(os.path.join(tmp, 'test.zip'), 'w') as z:
            z.write(os.path.join(tmp, 'test'))
        with open(os.path.join(tmp, 'test.zip'), 'rb') as z:
            response = response.replace(body=z.read())

        actual = next(spider.parse_zipfile(response, None))

        assert isinstance(actual, File)
        assert '.json' in actual['file_name']
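# Helper: returns a spider whose crawler settings include FILES_STORE and the Kingfisher Process API URI and key.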
def spider_with_files_store(files_store, **kwargs):
    spider = spider_with_crawler(**kwargs)
    spider.crawler.settings['FILES_STORE'] = files_store
    spider.crawler.settings['KINGFISHER_API_URI'] = 'http://httpbin.org/anything'
    spider.crawler.settings['KINGFISHER_API_KEY'] = 'xxx'

    return spider
def test_parse_200():
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.data_type = 'release_package'
    spider.next_page_formatter = lambda url: 'next.json'

    generator = spider.parse(response_fixture())
    item = next(generator)
    request = next(generator)

    assert type(item) is File
    assert item == {
        'file_name': 'test',
        'url': 'http://example.com',
        'data': b'{"links": {"next": "http://example.com/next"}}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    }

    assert type(request) is Request
    assert request.url == 'http://example.com/next'
    assert request.meta == {'file_name': 'next.json'}

    with pytest.raises(StopIteration):
        next(generator)
def test_from_crawler_missing_arguments():
    spider = spider_with_crawler()

    with pytest.raises(NotConfigured) as excinfo:
        KingfisherFilesStore.from_crawler(spider.crawler)

    assert str(excinfo.value) == 'FILES_STORE is not set.'
def test_qs_parameters(kwargs, expected):
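    # Build a throwaway BaseSpider subclass with type(); `kwargs` and `expected` come from a parametrization not shown here.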
    test_spider = type('TestSpider', (BaseSpider,), {
        'start_requests': lambda _self: [scrapy.Request('http://example.com')],
    })
    spider = spider_with_crawler(test_spider, **kwargs)

    for request in spider.start_requests():
        assert expected in request.url
def test_yield_items(middleware_class, item):
    spider = spider_with_crawler()

    middleware = middleware_class()

    generator = middleware.process_spider_output(None, [item], spider)
    returned_item = next(generator)

    assert item == returned_item
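# `api_url` and `api_key` come from a pytest parametrization (or fixture) that is not shown in this excerpt.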
def test_from_crawler_missing_arguments(api_url, api_key):
    spider = spider_with_crawler()
    spider.crawler.settings['KINGFISHER_API_URI'] = api_url
    spider.crawler.settings['KINGFISHER_API_KEY'] = api_key

    with pytest.raises(NotConfigured) as excinfo:
        KingfisherProcessAPI.from_crawler(spider.crawler)

    assert str(excinfo.value) == 'KINGFISHER_API_URI and/or KINGFISHER_API_KEY is not set.'
def test_from_crawler():
    spider = spider_with_crawler()
    spider.crawler.settings['KINGFISHER_API_URI'] = 'http://httpbin.org/anything'
    spider.crawler.settings['KINGFISHER_API_KEY'] = 'xxx'
    spider.crawler.settings['KINGFISHER_API_LOCAL_DIRECTORY'] = 'localdir'

    api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    assert api_extension.directory == 'localdir'
def test_next_link():
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.next_page_formatter = lambda url: 'next.json'

    request = spider.next_link(response_fixture())

    assert type(request) is Request
    assert request.url == 'http://example.com/next'
    assert request.meta == {'file_name': 'next.json'}
def test_process_file_without_sample():
    pipeline = Sample()
    spider = spider_with_crawler()
    item = File({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })
    assert pipeline.process_item(item, spider) == item
def test_process_item_file_error():
    pipeline = Sample()
    spider = spider_with_crawler(sample=1)
    item = FileError({
        'file_name': 'test',
        'url': 'http://test.com',
        'errors': 'error',
    })
    with pytest.raises(DropItem):
        pipeline.process_item(item, spider)
def test_from_crawler():
    spider = spider_with_crawler(settings={
        'KINGFISHER_API_URI': 'http://httpbin.org/anything',
        'KINGFISHER_API_KEY': 'xxx',
        'KINGFISHER_API_LOCAL_DIRECTORY': 'localdir',
    })

    extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    assert extension.directory == 'localdir'
def test_next_link():
    spider = spider_with_crawler(spider_class=LinksSpider)

    url = 'https://example.com/remote.json'
    text_response = text.TextResponse('test')
    response = text_response.replace(body='{"links": {"next": "' + url + '"}}')

    actual = spider.next_link(response)

    assert actual.url == url
def test_date_arguments():
    test_date = '2000-01-01'
    error_message = "time data 'test' does not match format '%Y-%m-%d'"

    assert spider_with_crawler(from_date=test_date)
    with pytest.raises(SpiderArgumentError) as e:
        spider_with_crawler(from_date='test')
    assert str(e.value) == f'spider argument `from_date`: invalid date value: {error_message}'

    assert spider_with_crawler(until_date=test_date,
                               default_from_date=test_date)
    with pytest.raises(SpiderArgumentError) as e:
        spider_with_crawler(until_date='test', default_from_date=test_date)
    assert str(e.value) == f'spider argument `until_date`: invalid date value: {error_message}'
def test_spider_closed_without_items():
    with TemporaryDirectory() as tmpdirname:
        spider = spider_with_crawler(
            settings={'KINGFISHER_PLUCK_PATH': tmpdirname},
            release_pointer='/date')
        extension = KingfisherPluck.from_crawler(spider.crawler)

        extension.spider_closed(spider, 'itemcount')

        with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
            assert 'closed: itemcount,test\n' == f.read()
def test_process_item_xlsx():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.xlsx',
        'data': save_virtual_workbook(Workbook()),
        'data_type': 'release_package',
        'url': 'http://test.com/test.xlsx',
    })

    assert pipeline.process_item(item, spider) == item
def test_parse_zipfile_release_package():
    spider = spider_with_crawler(spider_class=ZipSpider)

    response = text.TextResponse('test')
    response.status = 200
    response.request = Mock()
    response.request.meta = {'kf_filename': 'test.json'}
    response.request.url = 'url'

    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider.crawler.settings['FILES_STORE'] = files_store
        tmp = os.path.join(files_store, 'test', '20010203_040506')
        os.makedirs(tmp)

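        # Write a release package with 110 releases, so parse_zipfile has to split it into chunks of at most MAX_RELEASES_PER_PACKAGE releases.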
        with open(os.path.join(tmp, 'test.json'), 'w') as f:
            release = {
                'releases': [],
                'publisher': {
                    'name': 'test'
                },
                'extensions': ['a', 'b'],
                'license': 'test',
                'extra': 1.1
            }
            for i in range(110):
                release['releases'].append({'key': 'value'})
            json.dump(release, f)
        with ZipFile(os.path.join(tmp, 'test.zip'), 'w') as z:
            z.write(os.path.join(tmp, 'test.json'))
        with open(os.path.join(tmp, 'test.zip'), 'rb') as z:
            response = response.replace(body=z.read())

        actual = next(spider.parse_zipfile(response, None, file_format='release_package'))
        data = json.loads(actual['data'])

        assert isinstance(actual, FileItem)
        assert actual['number'] == 1
        assert data['publisher']['name'] == 'test'
        assert data['extensions'] == ['a', 'b']
        assert len(data['releases']) == spider.MAX_RELEASES_PER_PACKAGE

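        # In sample mode, a single FileItem containing MAX_SAMPLE releases is expected.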
        spider.sample = True
        total = 0
        for item in spider.parse_zipfile(response,
                                         None,
                                         file_format='release_package'):
            total = total + 1
            data = json.loads(item['data'])
            assert isinstance(item, FileItem)
            assert item['number'] == total
            assert len(data['releases']) == spider.MAX_SAMPLE
        assert total == 1
def test_build_file_with_existing_directory():
    spider = spider_with_crawler()

    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider.crawler.settings['FILES_STORE'] = files_store
        store_extension = KingfisherFilesStore.from_crawler(spider.crawler)
        os.makedirs(os.path.join(files_store, 'test', '20010203_040506'))

        # No FileExistsError exception.
        store_extension.item_scraped(spider.build_file(b'{"key": "value"}', 'file.json'), spider)
def test_process_item_csv():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.csv',
        'data': b'data',
        'data_type': 'release_package',
        'url': 'http://test.com/test.csv',
    })

    assert pipeline.process_item(item, spider) == item