def test_parser_and_checksum(filepath):
    response = _read_response(_data_dir, filepath)
    raw_parser = RawFeedParser()
    raw_result = raw_parser.parse(response)
    assert raw_result.feed
    assert raw_result.storys
    parser = FeedParser()
    result = parser.parse(raw_result)
    assert result.feed
    assert result.storys
    assert result.checksum.size() == len(result.storys)
def test_parse_story_no_id_no_summary_no_url():
    # the feed has 3 storys in total; the story without an id is skipped
    # story#0: no content, no summary, no url
    # story#1: has content, no summary, no url, but its id is a valid url
    filename = 'well/v2ex-no-id-no-summary-no-url.xml'
    response = _read_response(_data_dir, filename)
    raw_result = RawFeedParser().parse(response)
    assert raw_result.storys
    # the story without an id is skipped
    assert len(raw_result.storys) == 2
    # no summary
    assert not raw_result.storys[0]['summary']
    assert not raw_result.storys[1]['summary']
    # content
    assert not raw_result.storys[0]['content']
    assert raw_result.storys[1]['content']
    # pick the id as url, discard the invalid one
    assert not raw_result.storys[0]['url']
    assert raw_result.storys[1]['url']
    result = FeedParser().parse(raw_result)
    assert result.storys
    assert len(raw_result.storys) == len(result.storys)
    # content
    assert not result.storys[0]['content']
    assert result.storys[1]['content']
    # summary is extracted from content
    assert not result.storys[0]['summary']
    assert result.storys[1]['summary']
    # pick the id as url, discard the invalid one
    assert not result.storys[0]['url']
    assert result.storys[1]['url']
def test_parse_too_many_storys():
    items = []
    num_storys = 2000
    base = datetime.datetime.now()
    for i in range(num_storys):
        if i < num_storys // 2:
            date_published = None
        else:
            date_published = (base + datetime.timedelta(seconds=i)).isoformat()
        items.append({
            "id": f"{i}",
            "content_html": f"content_{i}",
            "summary": f"summary_{i}",
            "url": f"https://example.org/post/{i}",
            "date_published": date_published,
        })
    feed = {
        "version": "https://jsonfeed.org/version/1",
        "title": "Too many storys",
        "home_page_url": "https://example.org/",
        "feed_url": "https://example.org/feed.json",
        "items": items,
    }
    data = json.dumps(feed).encode('utf-8')
    response = _create_builder(data).build()
    raw_result = RawFeedParser().parse(response)
    assert len(raw_result.storys) == num_storys
    result = FeedParser().parse(raw_result)
    assert len(result.storys) == _MAX_STORYS
    expected = set(range(num_storys - _MAX_STORYS, num_storys))
    story_ids = {int(x['ident']) for x in result.storys}
    assert story_ids == expected
def _parse_found(found):
    response: FeedResponse
    raw_result: RawFeedResult
    response, raw_result = found
    feed = AttrDict()
    # feed response
    feed.use_proxy = response.use_proxy
    feed.url = response.url
    feed.content_length = len(response.content)
    feed.content_hash_base64 = compute_hash_base64(response.content)
    feed.etag = response.etag
    feed.last_modified = response.last_modified
    feed.encoding = response.encoding
    del found, response  # release memory in advance
    # parse feed and storys
    result = FeedParser().parse(raw_result)
    del raw_result  # release memory in advance
    feed.title = result.feed['title']
    feed.link = result.feed['home_url']
    feed.author = result.feed['author_name']
    feed.icon = result.feed['icon_url']
    feed.description = result.feed['description']
    feed.dt_updated = result.feed['dt_updated']
    feed.version = result.feed['version']
    feed.storys = _get_storys(result.storys)
    del result  # release memory in advance
    return validate_feed(feed)
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified)
    with FeedReader() as reader:
        status_code, response = reader.read(url, **params)
    LOG.info(f'read feed#{feed_id} url={unquote(url)} status_code={status_code}')
    if status_code != 200 or not response:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(f'feed#{feed_id} url={unquote(url)} not modified (content hash unchanged)')
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    parsed = FeedParser.parse_response(response)
    if parsed.bozo:
        LOG.warning(f'failed to parse feed#{feed_id} url={unquote(url)}: {parsed.bozo_exception}')
        return
    try:
        feed = _parse_found(parsed)
    except Invalid as ex:
        LOG.warning(f'invalid feed#{feed_id} url={unquote(url)}: {ex}', exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))
def _parse_well_feed(filename) -> FeedResult:
    response = _read_response(_data_dir / 'well', filename)
    raw_result = RawFeedParser().parse(response)
    assert raw_result.feed
    assert raw_result.storys
    assert not raw_result.warnings
    result = FeedParser().parse(raw_result)
    assert len(result.storys) == len(raw_result.storys)
    return result
def _parse_found(found, checksum_data=None, is_refresh=False):
    response: FeedResponse
    raw_result: RawFeedResult
    response, raw_result = found
    feed = AttrDict()
    # feed response
    feed.use_proxy = response.use_proxy
    feed.url = response.url
    feed.content_length = len(response.content)
    feed.content_hash_base64 = compute_hash_base64(response.content)
    feed.etag = response.etag
    feed.last_modified = response.last_modified
    feed.encoding = response.encoding
    feed.response_status = response.status
    del found, response  # release memory in advance
    # parse feed and storys
    checksum = None
    if checksum_data and (not is_refresh):
        checksum = FeedChecksum.load(checksum_data)
    result = FeedParser(checksum=checksum).parse(raw_result)
    checksum_data = result.checksum.dump(limit=300)
    num_raw_storys = len(raw_result.storys)
    warnings = None
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
    del raw_result  # release memory in advance
    msg = "feed url=%r storys=%s changed_storys=%s"
    LOG.info(msg, feed.url, num_raw_storys, len(result.storys))
    feed.title = result.feed['title']
    feed.link = result.feed['home_url']
    feed.author = result.feed['author_name']
    feed.icon = result.feed['icon_url']
    feed.description = result.feed['description']
    feed.dt_updated = result.feed['dt_updated']
    feed.version = result.feed['version']
    feed.storys = _get_storys(result.storys)
    feed.checksum_data = checksum_data
    feed.warnings = warnings
    del result  # release memory in advance
    return validate_feed(feed)
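# Hedged usage sketch (illustrative only, not part of this module): how the
# checksum round trip in _parse_found above is expected to work across two
# syncs. Only FeedChecksum.load, FeedParser(checksum=...), and checksum.dump
# appear in the code here; `found`, `saved_checksum_data`, and `persist` are
# hypothetical placeholders for whatever the caller provides.
#
#   feed = _parse_found(found, checksum_data=saved_checksum_data)
#   persist(feed['checksum_data'])            # keep for the next sync, so only changed storys are returned
#   feed = _parse_found(found, is_refresh=True)   # is_refresh=True ignores the old checksum and re-parses all storys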
def test_parse_large_content(template_name, content_length, summary_length):
    content_snip = "<span>12345678</span>"
    summary_snip = '<span>123</span>'
    content_repeat = (content_length // len(content_snip)) + 1
    content = content_snip * content_repeat
    summary_repeat = (summary_length // len(summary_snip)) + 1
    summary = summary_snip * summary_repeat
    template = large_feed_templates[template_name]
    # use str.replace instead of str.format to avoid KeyError caused by braces in JSON templates
    data = (
        template
        .replace('${content}', content)
        .replace('${summary}', summary)
        .encode('utf-8')
    )
    response = _create_builder(content=data).build()
    raw_result = RawFeedParser().parse(response)
    assert raw_result and len(raw_result.storys) == 1
    assert len(raw_result.storys[0]['content']) <= _RAW_MAX_CONTENT_LENGTH
    assert len(raw_result.storys[0]['summary']) <= _RAW_MAX_SUMMARY_LENGTH
    result = FeedParser().parse(raw_result)
    assert result and len(result.storys) == 1
    assert len(result.storys[0]['content']) <= _MAX_CONTENT_LENGTH
    assert len(result.storys[0]['summary']) <= _MAX_SUMMARY_LENGTH