def test_should_return_python_objects_when_reading_file_without_custom_decoder(self, tmp_path, create_msgpack_file):
    """read_mp must yield back the original Python objects when no custom decoder is given.

    The msgpack file is read twice — once via a ``pathlib.Path`` and once via
    its string representation — to check both accepted argument types.
    """
    given_data = [[1, 2], 'hello', {'fruit': 'apple'}]
    mp_file = tmp_path / 'data.mp'
    create_msgpack_file(mp_file, given_data)
    for file in [mp_file, f'{mp_file}']:
        # list(...) instead of the identity comprehension [item for item in ...]
        assert list(read_mp(file)) == given_data
def test_should_write_bytes_when_giving_content_in_write_mode_without_custom_encoder(self, tmp_path):
    """write_mp in 'w' mode must persist a single item that read_mp can recover."""
    content = {'name': 'Kevin', 'fruit': 'water melon'}
    mp_file = tmp_path / 'data.mp'
    length = write_mp(mp_file, content, mode='w')
    # A successful write reports a positive byte count.
    assert length > 0
    # list(...) instead of the identity comprehension [item for item in ...]
    assert [content] == list(read_mp(mp_file))
def test_should_save_correct_output_when_giving_file_url(self, page_1_file_url, tmp_path, browser):
    """End-to-end SeleniumSpider run against a local file URL.

    Checks the crawl statistics (followed/reachable/unreachable URLs, request
    counter, timing figures) and verifies that every backed-up item went
    through the datetime item processor, counting the Einstein quotes.
    """
    backup_path = tmp_path / 'backup.mp'
    config = Configuration(
        item_processors=[self.processor],
        backup_filename=f'{backup_path}',
        selenium_driver_log_file=None,
        selenium_browser=browser,
    )
    spider = SeleniumSpider(urls=[page_1_file_url], parse=self.parse, config=config)
    spider.run()
    stats = spider.statistics()
    # page 1 links to pages 2 and 3 (URLs differ only by that digit).
    followed_urls = {
        page_1_file_url.replace('1', '2'),
        page_1_file_url.replace('1', '3'),
    }
    assert followed_urls == stats.followed_urls
    assert {page_1_file_url} | followed_urls == stats.reachable_urls
    assert 3 == stats.request_counter
    # original asserted total_time > 0 twice; once is enough.
    assert stats.total_time > 0
    assert stats.average_fetch_time == spider._total_fetch_time / stats.request_counter
    assert set() == stats.unreachable_urls
    assert set() == stats.robot_excluded_urls
    albert_count = 0
    for item in read_mp(backup_path, decoder=datetime_decoder):
        # the item processor must have stamped every item with a datetime.
        assert isinstance(item['date'], datetime)
        if item['author'] == 'Albert Einstein':
            # leftover debug print(item) removed.
            albert_count += 1
    assert albert_count == 3
def test_should_return_python_object_when_reading_file_with_custom_decoder(
    self, tmp_path, create_msgpack_file, decode_datetime
):
    """read_mp with a custom decoder must round-trip types msgpack cannot encode natively (datetime)."""
    given_data = ['hello', datetime.now()]
    mp_file = tmp_path / 'data.mp'
    create_msgpack_file(mp_file, given_data)
    # list(...) instead of the identity comprehension [item for item in ...]
    assert list(read_mp(mp_file, decoder=decode_datetime)) == given_data
def test_should_write_bytes_when_giving_content_in_append_mode_without_custom_encoder(self, tmp_path):
    """Successive write_mp calls in 'a' mode must accumulate items in order."""
    content = ['foo', 4, {'fruit': 'water melon'}, [1, 4]]
    mp_file = tmp_path / 'data.mp'
    for item in content:
        length = write_mp(mp_file, item, mode='a')
        # every append reports a positive byte count.
        assert length > 0
    # list(...) instead of the identity comprehension [item for item in ...]
    assert content == list(read_mp(mp_file))
def test_should_write_bytes_when_giving_content_in_write_mode_with_custom_encoder(
    self, tmp_path, encode_datetime, decode_datetime
):
    """write_mp in 'w' mode with a custom encoder must round-trip datetime values.

    The filename is passed as a plain string to check that form is accepted too.
    """
    content = {'name': 'Kevin', 'date': datetime.now()}
    mp_file = tmp_path / 'data.mp'
    length = write_mp(f'{mp_file}', content, mode='w', encoder=encode_datetime)
    assert length > 0
    # list(...) instead of the identity comprehension [item for item in ...]
    assert [content] == list(read_mp(mp_file, decoder=decode_datetime))
def test_should_write_bytes_when_giving_content_in_append_mode_with_custom_encoder(
    self, tmp_path, encode_datetime, decode_datetime
):
    """write_mp in 'a' mode with a custom encoder must accumulate and round-trip datetimes."""
    content = ['foo', datetime.now()]
    mp_file = tmp_path / 'data.mp'
    for item in content:
        length = write_mp(f'{mp_file}', item, mode='a', encoder=encode_datetime)
        assert length > 0
    # list(...) instead of the identity comprehension [item for item in ...]
    assert content == list(read_mp(mp_file, decoder=decode_datetime))
def test_should_raise_error_when_decoder_is_not_callable(self, decoder):
    """A non-callable decoder must make read_mp raise a TypeError naming the offender."""
    with pytest.raises(TypeError) as exc_info:
        next(read_mp('foo.mp', decoder=decoder))
    expected_message = f'{decoder} is not callable'
    assert str(exc_info.value) == expected_message
# NOTE(review): this chunk starts mid-way through a parse callback whose `def`
# line is not visible here — the statements below locate the "next page" link
# with the Selenium driver and follow it when present. Formatting reconstructed;
# indentation relative to the enclosing function must be restored by hand.
next_link = None
try:
    element = response.driver.find_element_by_xpath(
        '//nav/ul/li[@class="next"]/a')
    next_link = element.get_attribute('href')
except NoSuchElementException:
    # no "next" link on the last page — stop following.
    pass
if next_link is not None:
    response.follow(next_link)


def date_processor(item: dict) -> dict:
    """Item processor: stamp each scraped item with the current datetime."""
    item['date'] = datetime.now()
    return item


if __name__ == '__main__':
    # Back up scraped items next to this script.
    backup = Path(__file__).parent / 'backup.mp'
    config = Configuration(selenium_driver_log_file=None,
                           backup_filename=f'{backup}',
                           item_processors=[date_processor])
    sel_spider = SeleniumSpider(urls=['http://quotes.toscrape.com'],
                                parse=parse, config=config)
    sel_spider.run()
    print(sel_spider.statistics())
    # you can do whatever you want with the results
    for quote_data in read_mp(filename=backup, decoder=datetime_decoder):
        print(quote_data)
# NOTE(review): this chunk starts mid-way through a dict literal built inside a
# parse callback whose opening lines are not visible here — the keys below
# complete a `data` item that is then saved via `spider.save_item`. Formatting
# reconstructed; re-attach to the enclosing function when merging.
        'message': quote.xpath('./span[@class="text"]/text()').get(),
        'author': quote.xpath('./span/small/text()').get(),
        'tags': quote.xpath('./div/a/text()').getall(),
    }
    spider.save_item(data)
    next_link = response.xpath('//nav/ul/li[@class="next"]/a').xpath(
        '@href').get()
    if next_link is not None:
        response.follow(next_link)


def date_processor(item: dict) -> dict:
    """Item processor: stamp each scraped item with the current datetime."""
    item['date'] = datetime.now()
    return item


if __name__ == '__main__':
    # Back up scraped items next to this script.
    backup = Path(__file__).parent / 'backup.mp'
    config = Configuration(backup_filename=f'{backup}',
                           item_processors=[date_processor])
    spider = StaticSpider(urls=['http://quotes.toscrape.com'],
                          parse=parse, config=config)
    spider.run()
    print(spider.statistics())
    # you can do whatever you want with the results
    for quote_data in read_mp(filename=backup,
                              decoder=spider.config.msgpack_decoder):
        print(quote_data)