예제 #1
0
def do_update_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
):
    """Write fetched content onto the story identified by (feed_id, offset).

    Steps:

    1. Load the story; if it does not exist, log an error and stop.
    2. If ``is_fulltext_content(content)`` is false, convert both the stored
       and the fetched HTML to text and hand them to ``is_summary``. When
       that check fails as well, log at info level and stop without writing.
    3. Otherwise update ``link``, ``content``, ``summary`` and
       ``has_mathjax`` through ``STORY_SERVICE.update_story`` and then call
       ``_detect_story_images`` with the story object loaded in step 1
       (i.e. the instance read before the update).

    Returns ``None`` on every path.
    """
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        LOG.error('story#%s,%s not found', feed_id, offset)
        return

    if not is_fulltext_content(content):
        # Second chance: presumably accepts the fetched page when the stored
        # feed text is a summary of it -- confirm against is_summary().
        stored_text = processor.story_html_to_text(story.content)
        fetched_text = processor.story_html_to_text(content)
        if not is_summary(stored_text, fetched_text):
            LOG.info(
                'fetched story#%s,%s url=%r is not fulltext of feed story content',
                feed_id, offset, url,
            )
            return

    changes = {
        'link': url,
        'content': content,
        'summary': summary,
        'has_mathjax': has_mathjax,
    }
    STORY_SERVICE.update_story(feed_id, offset, changes)
    _detect_story_images(ctx, story)
예제 #2
0
def do_sync_story_fulltext(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
) -> T.dict(
        feed_id=T.int,
        offset=T.int.min(0),
        use_proxy=T.bool,
        url=T.url,
        response_status=T.int,
        accept=T_ACCEPT,
):
    """Fetch the full text of one story via the worker and store it.

    The feed and story are read first, then ``worker_rss.fetch_story`` is
    asked (synchronously, through ``ctx.ask``) to fetch ``story.link``. If
    the worker returns content, it is passed to ``_update_story`` and the
    resulting accept strategy is reported back.

    Returns a dict with ``feed_id``, ``offset``, ``url``, ``use_proxy`` and
    ``accept`` (initially ``FulltextAcceptStrategy.REJECT.value``), plus
    ``response_status``:

    * on an ask timeout, ``response_status`` is
      ``FeedResponseStatus.CONNECTION_TIMEOUT`` and ``accept`` stays REJECT;
    * otherwise ``response_status`` and ``use_proxy`` are taken from the
      worker result, and ``accept`` is overwritten only when the worker
      returned non-empty content.

    Raises:
        AssertionError: if no story exists at ``(feed_id, offset)``.
    """
    with log_django_context_metric('harbor_rss.sync_story_fulltext:read'):
        feed = Feed.get_by_pk(feed_id, detail='+use_proxy')
        story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        # Raised explicitly rather than via `assert` so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        raise AssertionError(f'story#{feed_id},{offset} not found')
    story_content_info = StoryContentInfo(story.content)
    num_sub_sentences = len(split_sentences(story_content_info.text))
    # Pessimistic default: the result says REJECT until an update succeeds.
    ret = dict(
        feed_id=feed_id,
        offset=offset,
        url=story.link,
        use_proxy=feed.use_proxy,
        accept=FulltextAcceptStrategy.REJECT.value,
    )
    try:
        result = ctx.ask(
            'worker_rss.fetch_story',
            dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                feed_id=feed_id,
                offset=offset,
                num_sub_sentences=num_sub_sentences,
            ))
    except _TIMEOUT_ERRORS as ex:
        LOG.error('Ask worker_rss.fetch_story timeout: %s', ex)
        ret.update(response_status=FeedResponseStatus.CONNECTION_TIMEOUT)
        return ret
    ret.update(
        response_status=result['response_status'],
        use_proxy=result['use_proxy'],
    )
    if not result['content']:
        # Nothing fetched: report the worker status with accept left as REJECT.
        return ret
    with log_django_context_metric('harbor_rss.sync_story_fulltext:write'):
        accept = _update_story(
            story=story,
            story_content_info=story_content_info,
            content=result['content'],
            summary=None,  # not need update summary
            url=result['url'],
            sentence_count=result['sentence_count'],
        )
        ret.update(accept=accept.value)
    return ret
예제 #3
0
def do_update_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
    response_status: T.int.optional,
    sentence_count: T.int.min(0).optional,
):
    """Apply fetched story data to the story at (feed_id, offset).

    Looks the story up and delegates the actual update to ``_update_story``,
    passing a ``StoryContentInfo`` built from the currently stored content
    together with the new ``content``, ``summary``, ``url``, ``has_mathjax``
    and ``sentence_count``. If the story does not exist, an error is logged
    and nothing else happens.

    ``ctx`` and ``response_status`` are accepted but not used in this body.
    Always returns ``None``; the result of ``_update_story`` is discarded.
    """
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if story:
        current_info = StoryContentInfo(story.content)
        _update_story(
            story=story,
            story_content_info=current_info,
            content=content,
            summary=summary,
            url=url,
            has_mathjax=has_mathjax,
            sentence_count=sentence_count,
        )
        return
    LOG.error('story#%s,%s not found', feed_id, offset)
예제 #4
0
def _replace_story_images(feed_id, offset):
    """Rewrite image URLs in a story's content and save the result.

    Parses the story content for images, detects the status of each image
    URL, and maps every URL whose status is in ``IMAGE_REFERER_DENY_STATUS``
    to an ``/api/v1/image/...`` URL built from ``encode_image_url``. The
    processed content is then written back with
    ``STORY_SERVICE.update_story``.

    Returns ``None``. Nothing is written when the story does not exist
    (an error is logged) or when the content contains no image URLs.
    """
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        # The lookup can come back empty; handle it the same way the
        # do_update_story actors do instead of failing on `story.link`.
        LOG.error('story#%s,%s not found', feed_id, offset)
        return
    image_processor = StoryImageProcessor(story.link, story.content)
    image_indexs = image_processor.parse()
    image_urls = _image_urls_of_indexs(image_indexs)
    if not image_urls:
        return
    image_statuses = ImageInfo.batch_detect_images(image_urls)
    # Only images whose detected status is a referer-deny status get rewritten.
    image_replaces = {
        url: '/api/v1/image/{}?{}'.format(
            encode_image_url(url, story.link), RSSANT_IMAGE_TAG)
        for url, status in image_statuses.items()
        if status in IMAGE_REFERER_DENY_STATUS
    }
    LOG.info(f'story#{feed_id},{offset} {story.link} '
             f'replace {len(image_replaces)} referer deny images')
    # image_processor.process will (1) fix relative url (2) replace image url
    # call image_processor.process regardless of image_replaces is empty or not
    content = image_processor.process(image_indexs, image_replaces)
    STORY_SERVICE.update_story(feed_id, offset, {'content': content})