def test_story_dt_and_content_length(self):
    dt = timezone.datetime(2019, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
    story = {
        'unique_id': 'blog.example.com/1',
        'title': 'test story 1',
        'dt_published': dt,
        'dt_updated': dt,
    }
    # first save goes through Story.bulk_save_by_feed: no story info rows yet
    modified = Story.bulk_save_by_feed(self.feed_id, [validate_story(story)], batch_size=10)
    self.assertEqual(len(modified), 1)
    self.assert_feed_total_storys(1)
    self.assert_total_story_infos(0)
    dt_created = modified[0].dt_created
    dt_published = modified[0].dt_published
    assert modified[0].dt_updated == dt

    # update via STORY_SERVICE: dt_created and dt_published must stay frozen
    dt = dt + timezone.timedelta(days=1)
    updated_content = 'updated_content 1'
    story.update(
        content=updated_content,
        content_hash_base64=compute_hash_base64(updated_content),
        dt_published=dt,
        dt_updated=dt,
    )
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, [validate_story(story)], batch_size=10)
    self.assertEqual(len(modified), 1)
    self.assert_feed_total_storys(1)
    self.assert_total_story_infos(1)
    assert modified[0].dt_created == dt_created
    assert modified[0].dt_published == dt_published
    assert modified[0].dt_updated == dt
    assert modified[0].content_length == len(updated_content)

    dt = dt + timezone.timedelta(days=2)
    updated_content = 'updated_content 22'
    story.update(
        content=updated_content,
        content_hash_base64=compute_hash_base64(updated_content),
        dt_published=dt,
        dt_updated=dt,
    )
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, [validate_story(story)], batch_size=10)
    self.assertEqual(len(modified), 1)
    self.assert_feed_total_storys(1)
    self.assert_total_story_infos(1)
    assert modified[0].dt_created == dt_created
    assert modified[0].dt_published == dt_published
    assert modified[0].dt_updated == dt
    assert modified[0].content_length == len(updated_content)

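# A minimal sketch (hypothetical helper, not the real save code) of the rule the
# assertions above encode: dt_created and dt_published are frozen at first save,
# while dt_updated and content_length track the latest content.
def merge_story_sketch(old: dict, new: dict) -> dict:
    merged = dict(new)
    merged['dt_created'] = old['dt_created']      # frozen at first save
    merged['dt_published'] = old['dt_published']  # frozen at first save
    merged['content_length'] = len(new.get('content') or '')
    return merged
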
def _get_storys(entries: list):
    storys = []
    now = timezone.now()
    for data in entries:
        story = {}
        content = data['content']
        summary = data['summary']
        title = data['title']
        story['has_mathjax'] = data['has_mathjax']
        story['link'] = data['url']
        story['image_url'] = data['image_url']
        story['audio_url'] = data['audio_url']
        story['iframe_url'] = data['iframe_url']
        story['summary'] = summary
        story['content'] = content
        story['sentence_count'] = _compute_sentence_count(content)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['unique_id'] = data['ident']
        story['author'] = data["author_name"]
        dt_published = data['dt_published']
        dt_updated = data['dt_updated']
        # fall back between published/updated, and clamp future timestamps to now
        story['dt_published'] = min(dt_published or dt_updated or now, now)
        story['dt_updated'] = min(dt_updated or dt_published or now, now)
        storys.append(story)
    return storys

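# Stdlib-only sketch of the timestamp clamp above: fall back between the
# published and updated times, and never accept a time in the future.
from datetime import datetime, timedelta, timezone as tz

now = datetime(2020, 1, 1, tzinfo=tz.utc)
future = now + timedelta(days=7)
past = now - timedelta(days=7)
assert min(None or future or now, now) == now  # future dates clamp to now
assert min(past or None or now, now) == past   # past dates pass through
assert min(None or None or now, now) == now    # both missing -> now
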
def _parse_found(parsed):
    feed = AttrDict()
    res = parsed.response
    feed.use_proxy = parsed.use_proxy
    feed.url = _get_url(res)
    feed.content_length = len(res.content)
    feed.content_hash_base64 = compute_hash_base64(res.content)
    parsed_feed = parsed.feed
    feed.title = shorten(parsed_feed["title"], 200)
    link = parsed_feed["link"]
    if not link.startswith('http'):
        # Some link attributes are not URLs; use author_detail's href instead.
        # For example: 'http://www.cnblogs.com/grenet/'
        author_detail = parsed_feed['author_detail']
        if author_detail:
            link = author_detail['href']
        if not link.startswith('http'):
            link = feed.url
    feed.link = link
    feed.author = shorten(parsed_feed["author"], 200)
    feed.icon = parsed_feed["icon"] or parsed_feed["logo"]
    feed.description = parsed_feed["description"] or parsed_feed["subtitle"]
    feed.dt_updated = _get_dt_updated(parsed_feed)
    feed.etag = _get_etag(res)
    feed.last_modified = _get_last_modified(res)
    feed.encoding = res.encoding
    feed.version = shorten(parsed.version, 200)
    entries = list(parsed.entries)  # entries will be modified by _get_storys
    del parsed, res, parsed_feed  # release memory in advance
    feed.storys = _get_storys(entries)
    return validate_feed(feed)

def _parse_found(found):
    response: FeedResponse
    raw_result: RawFeedResult
    response, raw_result = found
    feed = AttrDict()
    # feed response
    feed.use_proxy = response.use_proxy
    feed.url = response.url
    feed.content_length = len(response.content)
    feed.content_hash_base64 = compute_hash_base64(response.content)
    feed.etag = response.etag
    feed.last_modified = response.last_modified
    feed.encoding = response.encoding
    del found, response  # release memory in advance
    # parse feed and storys
    result = FeedParser().parse(raw_result)
    del raw_result  # release memory in advance
    feed.title = result.feed['title']
    feed.link = result.feed['home_url']
    feed.author = result.feed['author_name']
    feed.icon = result.feed['icon_url']
    feed.description = result.feed['description']
    feed.dt_updated = result.feed['dt_updated']
    feed.version = result.feed['version']
    feed.storys = _get_storys(result.storys)
    del result  # release memory in advance
    return validate_feed(feed)

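# AttrDict above is presumably a dict with attribute-style access; a minimal
# stand-in to show the idea (assumption: the real class may do more):
class SketchAttrDict(dict):
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

feed = SketchAttrDict()
feed.title = 'example'
assert feed['title'] == 'example' and feed.title == 'example'
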
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified)
    with FeedReader() as reader:
        status_code, response = reader.read(url, **params)
    LOG.info(f'read feed#{feed_id} url={unquote(url)} status_code={status_code}')
    if status_code != 200 or not response:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(f'feed#{feed_id} url={unquote(url)} not modified (content hash unchanged)')
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    parsed = FeedParser.parse_response(response)
    if parsed.bozo:
        LOG.warning(f'failed parse feed#{feed_id} url={unquote(url)}: {parsed.bozo_exception}')
        return
    try:
        feed = _parse_found(parsed)
    except Invalid as ex:
        LOG.warning(f'invalid feed#{feed_id} url={unquote(url)}: {ex}', exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))

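# The not-modified short-circuit above compares a hash of the new response body
# against the stored hash before any parsing happens. Stdlib sketch (assumption:
# compute_hash_base64 is roughly SHA-1 plus base64; the real implementation may differ):
import base64
import hashlib

def sketch_hash_base64(content: bytes) -> str:
    return base64.standard_b64encode(hashlib.sha1(content).digest()).decode()

stored_hash = sketch_hash_base64(b'<rss>old</rss>')
assert sketch_hash_base64(b'<rss>old</rss>') == stored_hash  # unchanged body -> skip parsing
assert sketch_hash_base64(b'<rss>new</rss>') != stored_hash  # changed body -> parse and update
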
def get_story_of_feed_entry(data, now=None):
    """Convert the content of a feedlib.FeedResult into the data required by models.Feed."""
    if now is None:
        now = timezone.now()
    story = {}
    content = data['content']
    summary = data['summary']
    title = data['title']
    story['has_mathjax'] = data['has_mathjax']
    story['link'] = data['url']
    story['image_url'] = data['image_url']
    story['audio_url'] = data['audio_url']
    story['iframe_url'] = data['iframe_url']
    story['summary'] = summary
    story['content'] = content
    story['sentence_count'] = _compute_sentence_count(content)
    content_hash_base64 = compute_hash_base64(content, summary, title)
    story['title'] = title
    story['content_hash_base64'] = content_hash_base64
    story['unique_id'] = data['ident']
    story['author'] = data["author_name"]
    dt_published = data['dt_published']
    dt_updated = data['dt_updated']
    # fall back between published/updated, and clamp future timestamps to now
    story['dt_published'] = min(dt_published or dt_updated or now, now)
    story['dt_updated'] = min(dt_updated or dt_published or now, now)
    return story

def setUp(self):
    print('setUp')
    storys = []
    updated_storys = []
    now = timezone.datetime(2020, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
    for i in range(200):
        dt = now + timezone.timedelta(minutes=i)
        content = f'test story content {i}' * (i % 5)
        content_hash_base64 = compute_hash_base64(content)
        summary = content[:30]
        story = {
            'unique_id': f'blog.example.com/{i}',
            'title': f'test story {i}',
            'content_hash_base64': content_hash_base64,
            'author': 'tester',
            'link': f'https://blog.example.com/{i}.html',
            'dt_published': dt,
            'dt_updated': dt,
            'summary': summary,
            'content': content,
        }
        storys.append(validate_story(story))
        updated_story = dict(story)
        updated_content = f'test story content updated {i}' * (i % 5 + 1)
        updated_story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
        )
        updated_storys.append(validate_story(updated_story))
    self.storys = storys
    self.updated_storys = updated_storys
    feed = Feed(
        title='test feed',
        url='https://blog.example.com/feed.xml',
        status=FeedStatus.READY,
        dt_updated=timezone.now(),
    )
    feed.save()
    self.feed_id = feed.id

def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        content = ''
        if data["content"]:
            # both content and summary may appear in the content list; pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        link = normlize_url(data["link"])
        valid_link = ''
        if link:
            try:
                valid_link = validate_url(link)
            except Invalid:
                LOG.warning(f'invalid story link {link!r}')
        story['link'] = valid_link
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story link=%r content length=%s, will only save plain text!'
            LOG.warning(msg, link, len(content))
            content = story_html_to_text(content)
        content = process_story_links(content, valid_link)
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        title = shorten(data["title"] or link or summary, 200)
        unique_id = shorten(data['id'] or link or title, 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['unique_id'] = unique_id
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)

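# Stdlib sketch of the "pick the longest" selection above: feedparser-style
# entries may carry several content candidates; take the longest non-empty
# value, else fall back to description, then summary.
def pick_content(entry: dict) -> str:
    content = ''
    for candidate in entry.get('content') or []:
        value = candidate.get('value') or ''
        if len(value) > len(content):
            content = value
    return content or entry.get('description') or entry.get('summary') or ''

assert pick_content({'content': [{'value': 'a'}, {'value': 'abc'}]}) == 'abc'
assert pick_content({'content': [], 'description': '', 'summary': 's'}) == 's'
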
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified, use_proxy=use_proxy)
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    with FeedReader(**options) as reader:
        response = reader.read(url, **params)
    LOG.info(f'read feed#{feed_id} url={unquote(url)} response.status={response.status}')
    if response.status != 200 or not response.content:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(f'feed#{feed_id} url={unquote(url)} not modified (content hash unchanged)')
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url), ex)
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url), warnings)
        return
    try:
        feed = _parse_found((response, raw_result))
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s', feed_id, unquote(url), ex, exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))

def _parse_found(found, checksum_data=None, is_refresh=False):
    response: FeedResponse
    raw_result: RawFeedResult
    response, raw_result = found
    feed = AttrDict()
    # feed response
    feed.use_proxy = response.use_proxy
    feed.url = response.url
    feed.content_length = len(response.content)
    feed.content_hash_base64 = compute_hash_base64(response.content)
    feed.etag = response.etag
    feed.last_modified = response.last_modified
    feed.encoding = response.encoding
    feed.response_status = response.status
    del found, response  # release memory in advance
    # parse feed and storys
    checksum = None
    if checksum_data and (not is_refresh):
        checksum = FeedChecksum.load(checksum_data)
    result = FeedParser(checksum=checksum).parse(raw_result)
    checksum_data = result.checksum.dump(limit=300)
    num_raw_storys = len(raw_result.storys)
    warnings = None
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
    del raw_result  # release memory in advance
    msg = "feed url=%r storys=%s changed_storys=%s"
    LOG.info(msg, feed.url, num_raw_storys, len(result.storys))
    feed.title = result.feed['title']
    feed.link = result.feed['home_url']
    feed.author = result.feed['author_name']
    feed.icon = result.feed['icon_url']
    feed.description = result.feed['description']
    feed.dt_updated = result.feed['dt_updated']
    feed.version = result.feed['version']
    feed.storys = _get_storys(result.storys)
    feed.checksum_data = checksum_data
    feed.warnings = warnings
    del result  # release memory in advance
    return validate_feed(feed)

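# FeedChecksum above lets the parser drop storys that did not change between
# fetches. A rough stand-in keeping per-story fingerprints (assumption: the
# real class keeps a bounded fingerprint set, hence dump(limit=300)):
class SketchChecksum:
    def __init__(self):
        self._seen = {}

    def is_changed(self, unique_id: str, content_hash: str) -> bool:
        changed = self._seen.get(unique_id) != content_hash
        self._seen[unique_id] = content_hash
        return changed

checksum = SketchChecksum()
assert checksum.is_changed('story-1', 'hash-a')      # first sight -> changed
assert not checksum.is_changed('story-1', 'hash-a')  # same hash -> unchanged
assert checksum.is_changed('story-1', 'hash-b')      # new hash -> changed
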
def query_old_storys_by_feed(feed_id):
    sql = """
        SELECT unique_id, title, link, author, dt_published, dt_updated, summary, content
        FROM rssant_api_story_bak
        WHERE feed_id=%s
    """
    fields = [
        'unique_id', 'title', 'link', 'author',
        'dt_published', 'dt_updated', 'summary', 'content',
    ]
    storys = []
    with connection.cursor() as cursor:
        cursor.execute(sql, [feed_id])
        for row in cursor.fetchall():
            story = dict(zip(fields, row))
            story['content_hash_base64'] = compute_hash_base64(
                story['content'], story['summary'], story['title'])
            storys.append(story)
    return storys

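# The row-to-dict mapping relies on `fields` matching the SELECT column order
# exactly; a tiny illustration of the dict(zip(...)) pattern:
fields = ['unique_id', 'title']
row = ('blog.example.com/1', 'test story 1')
assert dict(zip(fields, row)) == {'unique_id': 'blog.example.com/1', 'title': 'test story 1'}
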
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        story['unique_id'] = shorten(_get_story_unique_id(data), 200)
        content = ''
        if data["content"]:
            # both content and summary may appear in the content list; pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        content = story_html_clean(content)
        content = process_story_links(content, data["link"])
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content  # TODO: performance
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        story['link'] = data["link"]
        title = shorten(data["title"] or story['link'] or story['unique_id'], 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)

def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    checksum_data: T.bytes.maxlen(4096).optional,
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
    is_refresh: T.bool.default(False),
):
    params = {}
    if not is_refresh:
        # conditional request headers are only useful for normal syncs
        params = dict(etag=etag, last_modified=last_modified)
    options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    # probability of switching from proxy back to direct access
    switch_prob = 0.25
    with FeedReader(**options) as reader:
        use_proxy = reader.has_proxy and use_proxy
        if use_proxy and random.random() < switch_prob:
            use_proxy = False
        response = reader.read(url, **params, use_proxy=use_proxy)
        LOG.info(f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        if (not use_proxy) and reader.has_proxy and need_proxy:
            # direct access failed with a proxy-fixable status, retry via proxy
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **params, use_proxy=True)
            LOG.info(f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}')
            if proxy_response.ok:
                response = proxy_response
    if (not response.ok) or (not response.content):
        status = FeedStatus.READY if response.status == 304 else FeedStatus.ERROR
        _update_feed_info(ctx, feed_id, status=status, response=response)
        return
    new_hash = compute_hash_base64(response.content)
    if (not is_refresh) and (new_hash == content_hash_base64):
        LOG.info(f'feed#{feed_id} url={unquote(url)} not modified (content hash unchanged)')
        _update_feed_info(ctx, feed_id, response=response)
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url), ex)
        _update_feed_info(ctx, feed_id, status=FeedStatus.ERROR, response=response, warnings=str(ex))
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url), warnings)
    try:
        feed = _parse_found((response, raw_result), checksum_data=checksum_data, is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s', feed_id, unquote(url), ex, exc_info=ex)
        _update_feed_info(ctx, feed_id, status=FeedStatus.ERROR, response=response, warnings=str(ex))
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed, is_refresh=is_refresh))

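# Stdlib sketch of the proxy decision above: with probability switch_prob, a
# feed that previously used the proxy is retried directly, so feeds can drift
# back to direct access when the proxy is no longer needed (hypothetical helper):
import random

def decide_use_proxy(has_proxy: bool, use_proxy: bool, switch_prob: float = 0.25) -> bool:
    use_proxy = has_proxy and use_proxy
    if use_proxy and random.random() < switch_prob:
        return False  # randomly try direct access this round
    return use_proxy

random.seed(42)
direct_tries = sum(not decide_use_proxy(True, True) for _ in range(1000))
assert 150 < direct_tries < 350  # roughly switch_prob of the reads go direct
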
def is_modified(self, content_hash_base64=None, fields=None):
    if content_hash_base64 is None and fields:
        content_hash_base64 = compute_hash_base64(*fields)
    if content_hash_base64 is not None:
        return content_hash_base64 != self.content_hash_base64
    return True

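# Hedged usage sketch of is_modified with a minimal stand-in (the real Story is
# a Django model; the hash step here is faked for the sketch). An explicit hash
# takes precedence over fields, and with neither the story counts as modified.
class _StoryStub:
    content_hash_base64 = 'hash-old'

    def is_modified(self, content_hash_base64=None, fields=None):
        if content_hash_base64 is None and fields:
            content_hash_base64 = 'hash-' + '-'.join(fields)  # stand-in for compute_hash_base64
        if content_hash_base64 is not None:
            return content_hash_base64 != self.content_hash_base64
        return True

story = _StoryStub()
assert story.is_modified(content_hash_base64='hash-new')      # differing hash -> modified
assert not story.is_modified(content_hash_base64='hash-old')  # same hash -> unchanged
assert story.is_modified()                                    # no data -> assume modified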