def do_update_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
):
    """Store fetched content for a story, then run image detection on it.

    The story is looked up by ``(feed_id, offset)``. If it does not exist an
    error is logged and nothing is written.

    The fetched ``content`` is accepted when ``is_fulltext_content(content)``
    is truthy, or otherwise when ``is_summary`` is truthy for the plain text
    of the stored story content and the plain text of the fetched content.
    When neither holds, the rejection is logged at info level and the story
    is left untouched.

    On acceptance the story's ``link``, ``content``, ``summary`` and
    ``has_mathjax`` fields are updated and ``_detect_story_images`` is called
    with the story object that was loaded before the update.
    """
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        LOG.error('story#%s,%s not found', feed_id, offset)
        return

    # Short-circuit: the HTML-to-text conversions only run when the fetched
    # content is not already recognised as fulltext.
    accepted = is_fulltext_content(content) or is_summary(
        processor.story_html_to_text(story.content),
        processor.story_html_to_text(content),
    )
    if not accepted:
        LOG.info(
            'fetched story#%s,%s url=%r is not fulltext of feed story content',
            feed_id, offset, url,
        )
        return

    STORY_SERVICE.update_story(feed_id, offset, {
        'link': url,
        'content': content,
        'summary': summary,
        'has_mathjax': has_mathjax,
    })
    _detect_story_images(ctx, story)
def do_sync_story_fulltext(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
) -> T.dict(
    feed_id=T.int,
    offset=T.int.min(0),
    use_proxy=T.bool,
    url=T.url,
    response_status=T.int,
    accept=T_ACCEPT,
):
    """Fetch the fulltext of one story via the worker and try to store it.

    Steps:

    1. Load the feed (with its ``use_proxy`` detail) and the story.
    2. Ask ``worker_rss.fetch_story`` to fetch ``story.link``, passing along
       the number of sentences found in the current story text.
    3. If the worker returned content, hand it to ``_update_story`` and
       report the acceptance decision it returns.

    Returns a dict with ``feed_id``, ``offset``, ``url``, ``use_proxy``,
    ``response_status`` and ``accept``. ``accept`` starts out as
    ``FulltextAcceptStrategy.REJECT.value`` and is only replaced when
    ``_update_story`` ran. On a worker timeout ``response_status`` is set to
    ``FeedResponseStatus.CONNECTION_TIMEOUT`` and the dict is returned early.

    Raises:
        AssertionError: if the story does not exist.
    """
    with log_django_context_metric('harbor_rss.sync_story_fulltext:read'):
        feed = Feed.get_by_pk(feed_id, detail='+use_proxy')
        story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    assert story, f'story#{feed_id},{offset} not found'

    content_info = StoryContentInfo(story.content)
    sentence_total = len(split_sentences(content_info.text))

    # Pessimistic default: every early return below reports a rejection.
    ret = {
        'feed_id': feed_id,
        'offset': offset,
        'url': story.link,
        'use_proxy': feed.use_proxy,
        'accept': FulltextAcceptStrategy.REJECT.value,
    }

    try:
        result = ctx.ask('worker_rss.fetch_story', {
            'url': story.link,
            'use_proxy': feed.use_proxy,
            'feed_id': feed_id,
            'offset': offset,
            'num_sub_sentences': sentence_total,
        })
    except _TIMEOUT_ERRORS as ex:
        LOG.error(f'Ask worker_rss.fetch_story timeout: {ex}')
        # NOTE(review): unlike `accept`, this stores the FeedResponseStatus
        # member itself rather than `.value` -- confirm it satisfies the
        # `response_status=T.int` return schema.
        ret.update(response_status=FeedResponseStatus.CONNECTION_TIMEOUT)
        return ret

    # The worker reports the final status and whether a proxy was really used.
    ret.update({
        'response_status': result['response_status'],
        'use_proxy': result['use_proxy'],
    })
    if not result['content']:
        return ret

    with log_django_context_metric('harbor_rss.sync_story_fulltext:write'):
        decision = _update_story(
            story=story,
            story_content_info=content_info,
            content=result['content'],
            summary=None,  # the summary does not need to be updated
            url=result['url'],
            sentence_count=result['sentence_count'],
        )
    ret.update(accept=decision.value)
    return ret
def do_update_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
    response_status: T.int.optional,
    sentence_count: T.int.min(0).optional,
):
    """Apply fetched content to an existing story via ``_update_story``.

    The story is looked up by ``(feed_id, offset)``. If it does not exist an
    error is logged and the call returns without doing anything else.
    Otherwise the stored content is wrapped in a ``StoryContentInfo`` and
    passed to ``_update_story`` together with the fetched ``content``,
    ``summary``, ``url``, ``has_mathjax`` and ``sentence_count``.

    ``ctx`` and ``response_status`` are accepted but not used in this body.
    The result of ``_update_story`` is discarded; this function always
    returns ``None``.
    """
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        LOG.error('story#%s,%s not found', feed_id, offset)
        return

    current_info = StoryContentInfo(story.content)
    _update_story(
        story=story,
        story_content_info=current_info,
        content=content,
        summary=summary,
        url=url,
        has_mathjax=has_mathjax,
        sentence_count=sentence_count,
    )
def _replace_story_images(feed_id, offset):
    """Rewrite a story's image URLs and save the resulting content.

    The story content is parsed for images and the image URLs are checked
    with ``ImageInfo.batch_detect_images``. Every URL whose status is in
    ``IMAGE_REFERER_DENY_STATUS`` is mapped to
    ``/api/v1/image/<encoded>?<RSSANT_IMAGE_TAG>``, where ``<encoded>`` comes
    from ``encode_image_url(url, story.link)``. The processed content is then
    written back to the story.

    Nothing is written when the story does not exist (an error is logged,
    matching the ``do_update_story`` handlers) or when no image URLs are
    found in its content.

    Args:
        feed_id: Feed identifier of the story.
        offset: Offset of the story within the feed.
    """
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        # Without this guard a missing story fails with AttributeError on
        # `story.link` below.
        LOG.error('story#%s,%s not found', feed_id, offset)
        return

    image_processor = StoryImageProcessor(story.link, story.content)
    image_indexs = image_processor.parse()
    image_urls = _image_urls_of_indexs(image_indexs)
    if not image_urls:
        return

    image_statuses = ImageInfo.batch_detect_images(image_urls)
    image_replaces = {
        url: '/api/v1/image/{}?{}'.format(
            encode_image_url(url, story.link), RSSANT_IMAGE_TAG)
        for url, status in image_statuses.items()
        if status in IMAGE_REFERER_DENY_STATUS
    }
    LOG.info(
        'story#%s,%s %s replace %s referer deny images',
        feed_id, offset, story.link, len(image_replaces),
    )

    # image_processor.process will (1) fix relative urls and (2) replace
    # image urls, so it is called whether or not image_replaces is empty.
    content = image_processor.process(image_indexs, image_replaces)
    STORY_SERVICE.update_story(feed_id, offset, {'content': content})