Пример #1
0
    async def test_should_raise_error_when_decoder_is_not_callable(
            self, decoder):
        """Iterating read_mp with a non-callable decoder must raise TypeError."""
        with pytest.raises(TypeError) as raised:
            async for entry in read_mp('foo', decoder=decoder):
                print(entry)

        actual_message = str(raised.value)
        assert f'{decoder} is not callable' == actual_message
Пример #2
0
    async def test_should_save_correct_output_when_giving_file_url(self, page_1_file_url, tmp_path, browser):
        """Run a SeleniumSpider on a file url, then check its statistics and the backup file.

        The spider is started on ``page_1_file_url``; the test expects the two
        sibling pages (2 and 3) to be followed, three requests overall, and
        exactly three backed-up items whose author is Albert Einstein.
        """
        backup_path = tmp_path / 'backup.mp'
        config = Configuration(
            item_processors=[self.processor],
            backup_filename=f'{backup_path}',
            selenium_driver_log_file=None,
            selenium_browser=browser
        )
        spider = SeleniumSpider(urls=[page_1_file_url], parse=self.parse, config=config)
        await spider.run()
        stats = spider.statistics()
        # NOTE(review): str.replace substitutes every '1' in the url, not only the
        # page number - presumably the fixture url contains a single '1'; confirm.
        followed_urls = {
            page_1_file_url.replace('1', '2'),
            page_1_file_url.replace('1', '3')
        }

        assert followed_urls == stats.followed_urls
        assert {page_1_file_url} | followed_urls == stats.reachable_urls
        assert stats.total_time > 0
        assert stats.average_fetch_time == spider._total_fetch_time / stats.request_counter
        assert set() == stats.unreachable_urls
        assert set() == stats.robot_excluded_urls
        assert 3 == stats.request_counter

        # Every item read back must carry a decoded datetime; count Einstein's quotes.
        albert_count = 0
        async for item in read_mp(backup_path, decoder=datetime_decoder):
            assert isinstance(item['date'], datetime)
            if item['author'] == 'Albert Einstein':
                albert_count += 1

        assert albert_count == 3
Пример #3
0
    async def test_should_write_bytes_when_giving_content_in_write_mode_without_custom_encoder(
            self, trio_tmp_path):
        """Write one dict in 'w' mode and check it is the only item read back."""
        mp_file = trio_tmp_path / 'data.mp'
        content = {'name': 'Kevin', 'fruit': 'water melon'}

        written = await write_mp(mp_file, content, mode='w')
        assert written > 0

        stored_items = [entry async for entry in read_mp(mp_file)]
        assert [content] == stored_items
Пример #4
0
    async def test_should_return_python_objects_when_reading_file_without_custom_decoder(
            self, tmp_path, create_msgpack_file):
        """Read a prepared msgpack file through both a str path and a trio.Path."""
        mp_file = tmp_path / 'data.mp'
        given_data = [[1, 2], 'hello', {'fruit': 'apple'}]
        create_msgpack_file(mp_file, given_data)

        # The same file is addressed once as a plain string and once as a trio path.
        sources = (f'{mp_file}', trio.Path(mp_file))
        for source in sources:
            loaded = [entry async for entry in read_mp(source)]
            assert loaded == given_data
Пример #5
0
    async def test_should_write_bytes_when_giving_content_in_append_mode_without_custom_encoder(
            self, trio_tmp_path):
        """Append several objects one by one and check they are read back in order."""
        mp_file = trio_tmp_path / 'data.mp'
        content = ['foo', 4, {'fruit': 'water melon'}, [1, 4]]

        for element in content:
            written = await write_mp(mp_file, element, mode='a')
            assert written > 0

        stored_items = [entry async for entry in read_mp(mp_file)]
        assert content == stored_items
Пример #6
0
    async def test_should_return_python_objects_when_reading_file_with_custom_decoder(
            self, tmp_path, decode_datetime, create_msgpack_file):
        """Read a file containing a datetime using the decode_datetime fixture as decoder."""
        mp_file = tmp_path / 'data.mp'
        given_data = ['hello', datetime.now()]
        create_msgpack_file(mp_file, given_data)

        # The same file is addressed once as a plain string and once as a trio path.
        sources = (str(mp_file), trio.Path(mp_file))
        for source in sources:
            loaded = [
                entry async for entry in read_mp(source, decoder=decode_datetime)
            ]
            assert loaded == given_data
Пример #7
0
    async def common_assert(stats: SpiderStatistics, backup_path: Path):
        """Check the shared expectations on spider statistics and on the backup file.

        The statistics must report no unreachable or robot-excluded urls and a
        positive total time. Every item read from ``backup_path`` must hold a
        ``datetime`` under 'date', and exactly three items must have
        'Albert Einstein' as author.
        """
        assert stats.unreachable_urls == set()
        assert stats.robot_excluded_urls == set()
        assert stats.total_time > 0

        einstein_items = []
        async for record in read_mp(backup_path, decoder=datetime_decoder):
            assert isinstance(record['date'], datetime)
            if record['author'] == 'Albert Einstein':
                einstein_items.append(record)

        assert len(einstein_items) == 3
Пример #8
0
async def main() -> None:
    """Run a StaticSpider on quotes.toscrape.com, then print each item of the backup file."""
    backup = Path(__file__).parent / 'backup.mp'
    spider = StaticSpider(
        urls=['http://quotes.toscrape.com'],
        parse=parse,
        config=Configuration(
            backup_filename=f'{backup}',
            item_processors=[date_processor],
        ),
    )
    await spider.run()

    statistics = spider.statistics()
    print(statistics)

    # you can do whatever you want with the results
    decoder = spider.config.msgpack_decoder
    async for saved_item in read_mp(backup, decoder=decoder):
        print(saved_item)
Пример #9
0
async def main() -> None:
    """Run a SeleniumSpider on quotes.toscrape.com, then print each item of the backup file."""
    backup = Path(__file__).parent / 'backup.mp'
    spider = SeleniumSpider(
        urls=['http://quotes.toscrape.com'],
        parse=parse,
        config=Configuration(
            selenium_driver_log_file=None,
            backup_filename=f'{backup}',
            item_processors=[date_processor],
        ),
    )
    await spider.run()

    statistics = spider.statistics()
    print(statistics)

    # you can do whatever you want with the results
    async for saved_quote in read_mp(filename=backup, decoder=datetime_decoder):
        print(saved_quote)
Пример #10
0
    async def test_should_write_bytes_when_giving_content_in_write_mode_with_custom_encoder(
            self, trio_tmp_path, encode_datetime, decode_datetime):
        """Write a dict holding a datetime with encode_datetime and read it back with decode_datetime."""
        mp_file = trio_tmp_path / 'data.mp'
        content = {'name': 'Kevin', 'date': datetime.now()}

        # The file is written through its string path and read through the path object.
        written = await write_mp(
            f'{mp_file}', content, mode='w', encoder=encode_datetime
        )
        assert written > 0

        stored_items = [
            entry async for entry in read_mp(mp_file, decoder=decode_datetime)
        ]
        assert [content] == stored_items
Пример #11
0
    async def test_should_write_bytes_when_giving_content_in_append_mode_with_custom_encoder(
            self, trio_tmp_path, encode_datetime, decode_datetime):
        """Append a string and a datetime with encode_datetime and read both back in order."""
        mp_file = trio_tmp_path / 'data.mp'
        content = ['foo', datetime.now()]

        # The file is written through its string path and read through the path object.
        for element in content:
            written = await write_mp(
                f'{mp_file}', element, mode='a', encoder=encode_datetime
            )
            assert written > 0

        stored_items = [
            entry async for entry in read_mp(mp_file, decoder=decode_datetime)
        ]
        assert content == stored_items
Пример #12
0
async def main() -> None:
    """Run a SeleniumSpider on httpbin.org, then print a report for each item of the backup file.

    Each item read back is expected to expose 'title', 'description' and a
    list of 'operations', each with 'method', 'path' and 'description' keys.
    """
    backup = Path(__file__).parent / 'backup.mp'
    sel_spider = SeleniumSpider(
        urls=['http://httpbin.org/'],
        parse=parse,
        config=Configuration(
            selenium_driver_log_file=None,
            backup_filename=f'{backup}',
            item_processors=[date_processor],
        ),
    )
    await sel_spider.run()

    statistics = sel_spider.statistics()
    print(statistics)

    # you can do whatever you want with the results
    async for section in read_mp(filename=backup, decoder=datetime_decoder):
        print('****', section['title'], '****')
        print(section['description'])
        print('== operations ==')
        for entry in section['operations']:
            print('\tmethod:', entry['method'])
            print('\tpath:', entry['path'])
            print('\tdescription:', entry['description'], end='\n\n')
Пример #13
0
    async def test_should_save_content_to_backup_file(self, tmp_path, capsys):
        """Save two items through save_item and check the backup content and the processor output."""
        def processor(item):
            print("I'm a processor")
            return item

        fruit_1 = {'fruit': 'pineapple'}
        fruit_2 = {'fruit': 'orange'}
        backup = tmp_path / 'backup.mp'
        static_spider = StaticSpider(
            urls=['https://foo.com'],
            parse=lambda x, y: None,
            config=Configuration(
                backup_filename=f'{backup.resolve()}',
                item_processors=[processor],
            ),
        )

        for fruit in (fruit_1, fruit_2):
            await static_spider.save_item(fruit)
        captured_out, _ = capsys.readouterr()

        saved_items = [entry async for entry in read_mp(f'{backup.resolve()}')]
        assert [fruit_1, fruit_2] == saved_items
        assert "I'm a processor" in captured_out