Example #1
def do_update_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
):
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        LOG.error('story#%s,%s not found', feed_id, offset)
        return
    if not is_fulltext_content(content):
        story_text = processor.story_html_to_text(story.content)
        text = processor.story_html_to_text(content)
        if not is_summary(story_text, text):
            msg = 'fetched story#%s,%s url=%r is not fulltext of feed story content'
            LOG.info(msg, feed_id, offset, url)
            return
    data = dict(
        link=url,
        content=content,
        summary=summary,
        has_mathjax=has_mathjax,
    )
    STORY_SERVICE.update_story(feed_id, offset, data)
    _detect_story_images(ctx, story)
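Example #1 accepts the fetched page only when it looks like a full-text version of the feed entry (Example #2 below applies the same check). is_fulltext_content and is_summary are rssant-internal; the sketch below is only a guess at the kind of comparison involved, a longer text that contains the start of the feed text, and not the project's actual logic.

def naive_is_probably_fulltext(feed_text: str, fetched_text: str) -> bool:
    # assumption: treat the fetched page as full text when it is at
    # least as long as the feed-provided text and contains its start;
    # the real is_fulltext_content / is_summary checks are smarter
    feed_text = ' '.join(feed_text.split())
    fetched_text = ' '.join(fetched_text.split())
    if len(fetched_text) < len(feed_text):
        return False
    probe = feed_text[:200]
    return (not probe) or (probe in fetched_text)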
Example #2
def do_update_story(
    ctx: ActorContext,
    story_id: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
):
    story = Story.objects.get(pk=story_id)
    if not is_fulltext_content(content):
        story_text = processor.story_html_to_text(story.content)
        text = processor.story_html_to_text(content)
        if not is_summary(story_text, text):
            msg = 'fetched story#%s url=%r is not fulltext of feed story content'
            LOG.info(msg, story_id, url)
            return
    with transaction.atomic():
        story.refresh_from_db()
        story.link = url
        story.content = content
        story.summary = summary
        if has_mathjax is not None:
            story.has_mathjax = has_mathjax
        story.save()
    _detect_story_images(ctx, story)
Example #3
File: rss.py Project: jinofhust/rssant
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        content = ''
        if data["content"]:
            # both content and summary may appear in the content list, pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        link = normlize_url(data["link"])
        valid_link = ''
        if link:
            try:
                valid_link = validate_url(link)
            except Invalid:
                LOG.warning(f'invalid story link {link!r}')
        story['link'] = valid_link
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story link=%r content length=%s, will only save plain text!'
            LOG.warning(msg, link, len(content))
            content = story_html_to_text(content)
        content = process_story_links(content, valid_link)
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        title = shorten(data["title"] or link or summary, 200)
        unique_id = shorten(data['id'] or link or title, 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['unique_id'] = unique_id
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)
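Note: story_html_clean, story_html_to_text and shorten in the snippets above are rssant-internal helpers. Purely to illustrate what the HTML-to-text step does, a minimal stand-in can be built on the standard library's HTMLParser; this is a sketch of the idea, not the project's implementation (naive_html_to_text is a hypothetical name).

from html.parser import HTMLParser


class _TextExtractor(HTMLParser):
    """Collect text nodes, skipping <script> and <style> contents."""

    def __init__(self):
        super().__init__()
        self._chunks = []
        self._skip = 0

    def handle_starttag(self, tag, attrs):
        if tag in ('script', 'style'):
            self._skip += 1

    def handle_endtag(self, tag):
        if tag in ('script', 'style') and self._skip > 0:
            self._skip -= 1

    def handle_data(self, data):
        if self._skip == 0:
            self._chunks.append(data)

    def get_text(self):
        # collapse all whitespace runs into single spaces
        return ' '.join(' '.join(self._chunks).split())


def naive_html_to_text(html: str) -> str:
    parser = _TextExtractor()
    parser.feed(html)
    return parser.get_text()


# naive_html_to_text('<p>Hello <b>world</b></p>')  ->  'Hello world'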
Example #4
async def do_fetch_story(
        ctx: ActorContext,
        story_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
        num_sub_sentences: T.int.optional,
):
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    async with AsyncFeedReader(**options) as reader:
        use_proxy = use_proxy and reader.has_rss_proxy
        url_content = await _fetch_story(reader,
                                         story_id,
                                         url,
                                         use_proxy=use_proxy)
    if not url_content:
        return
    url, content = url_content
    if len(content) >= _MAX_STORY_HTML_LENGTH:
        content = story_html_clean(content)
        if len(content) >= _MAX_STORY_HTML_LENGTH:
            msg = 'too large story#%s size=%s url=%r'
            LOG.warning(msg, story_id, len(content), url)
            content = story_html_to_text(content)[:_MAX_STORY_HTML_LENGTH]
    await ctx.hope(
        'worker_rss.process_story_webpage',
        dict(
            story_id=story_id,
            url=url,
            text=content,
            num_sub_sentences=num_sub_sentences,
        ))
Example #5
File: rss.py Project: jinofhust/rssant
def do_process_story_webpage(
        ctx: ActorContext,
        story_id: T.int,
        url: T.url,
        text: T.str.maxlen(5 * 1024 * 1024),
):
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    content = story_readability(text)
    content = process_story_links(content, url)
    summary = shorten(story_html_to_text(content), width=300)
    if not summary:
        return
    ctx.hope(
        'harbor_rss.update_story',
        dict(
            story_id=story_id,
            content=content,
            summary=summary,
            url=url,
        ))
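The shorten(text, width=300) call used for summaries is also an rssant helper. For illustration only, a similar effect is available in the standard library via textwrap.shorten; the real helper may behave differently, for example with CJK text or placeholder handling (naive_shorten is a hypothetical name).

import textwrap


def naive_shorten(text: str, width: int = 300) -> str:
    # textwrap.shorten collapses whitespace and truncates on word
    # boundaries, appending the placeholder only when the text is cut
    return textwrap.shorten(text, width=width, placeholder='...')


# naive_shorten('Hello  world!', width=10)  ->  'Hello...'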
Example #6
File: rss.py Project: jinofhust/rssant
async def do_fetch_story(
        ctx: ActorContext,
        story_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
):
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    async with AsyncFeedReader(**_get_proxy_options()) as reader:
        use_proxy = use_proxy and reader.has_rss_proxy
        status, response = await reader.read(url, use_proxy=use_proxy)
    if response and response.url:
        url = str(response.url)
    LOG.info(
        f'fetch story#{story_id} url={unquote(url)} status={status} finished')
    if not (response and status == 200):
        return
    if not response.rssant_text:
        msg = 'story#%s url=%s response text is empty!'
        LOG.error(msg, story_id, unquote(url))
        return
    content = response.rssant_text
    if len(content) >= 1024 * 1024:
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story#%s size=%s url=%r'
            LOG.warning(msg, story_id, len(content), url)
            content = story_html_to_text(content)
    await ctx.hope('worker_rss.process_story_webpage',
                   dict(
                       story_id=story_id,
                       url=url,
                       text=content,
                   ))
Example #7
def do_process_story_webpage(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
):
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'too large story#%s size=%s url=%r, will only save plain text'
        LOG.warning(msg, story_id, len(content), url)
        content = shorten(story_html_to_text(content),
                          width=_MAX_STORY_CONTENT_LENGTH)
    # if the fetched content is shorter than the RSS content, it is not the real full text
    if num_sub_sentences is not None:
        if not is_fulltext_content(content):
            num_sentences = len(split_sentences(story_html_to_text(content)))
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s url=%s num_sentences=%s less than num_sub_sentences=%s'
                LOG.info(msg, story_id, url, num_sentences, num_sub_sentences)
                return
    summary = shorten(story_html_to_text(content),
                      width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return
    ctx.hope(
        'harbor_rss.update_story',
        dict(
            story_id=story_id,
            content=content,
            summary=summary,
            url=url,
        ))
Example #8
async def do_fetch_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int.min(0),
    url: T.url,
    use_proxy: T.bool.default(False),
    num_sub_sentences: T.int.optional,
) -> SCHEMA_FETCH_STORY_RESULT:
    LOG.info(f'fetch story#{feed_id},{offset} url={unquote(url)} begin')
    options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    # make timeout less than actor default 30s to avoid ask timeout
    options.update(request_timeout=25)
    async with AsyncFeedReader(**options) as reader:
        use_proxy = use_proxy and reader.has_proxy
        url, content, response = await _fetch_story(reader,
                                                    feed_id,
                                                    offset,
                                                    url,
                                                    use_proxy=use_proxy)
    DEFAULT_RESULT = dict(feed_id=feed_id,
                          offset=offset,
                          url=url,
                          response_status=response.status,
                          use_proxy=response.use_proxy)
    if not content:
        return DEFAULT_RESULT
    if len(content) >= _MAX_STORY_HTML_LENGTH:
        content = story_html_clean(content)
        if len(content) >= _MAX_STORY_HTML_LENGTH:
            msg = 'too large story#%s,%s size=%s url=%r'
            LOG.warning(msg, feed_id, offset, len(content), url)
            content = story_html_to_text(content)[:_MAX_STORY_HTML_LENGTH]
    msg_func = ctx.ask if ctx.message.is_ask else ctx.hope
    result = await msg_func(
        'worker_rss.process_story_webpage',
        dict(
            feed_id=feed_id,
            offset=offset,
            url=url,
            text=content,
            num_sub_sentences=num_sub_sentences,
        ))
    if not ctx.message.is_ask:
        return DEFAULT_RESULT
    result.update(DEFAULT_RESULT)
    return result
Example #9
async def do_fetch_story(
        ctx: ActorContext,
        story_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
):
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    async with AsyncFeedReader(**options) as reader:
        use_proxy = use_proxy and reader.has_rss_proxy
        response = await reader.read(url, use_proxy=use_proxy)
    if response and response.url:
        url = str(response.url)
    LOG.info(
        f'fetch story#{story_id} url={unquote(url)} status={response.status} finished'
    )
    if not (response and response.ok):
        return
    if not response.content:
        msg = 'story#%s url=%s response text is empty!'
        LOG.error(msg, story_id, unquote(url))
        return
    try:
        content = response.content.decode(response.encoding)
    except UnicodeDecodeError as ex:
        LOG.warning('fetch story unicode decode error=%s url=%r', ex, url)
        content = response.content.decode(response.encoding, errors='ignore')
    if len(content) >= 1024 * 1024:
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story#%s size=%s url=%r'
            LOG.warning(msg, story_id, len(content), url)
            content = story_html_to_text(content)
    await ctx.hope('worker_rss.process_story_webpage',
                   dict(
                       story_id=story_id,
                       url=url,
                       text=content,
                   ))
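Example #9 decodes the response body with a lossy fallback. The same pattern in isolation, using only the standard library (raw and encoding stand in for the response fields), might look like this sketch:

def decode_with_fallback(raw: bytes, encoding: str) -> str:
    # try a strict decode first; if the declared encoding does not
    # match the payload, drop undecodable bytes instead of failing
    # the whole fetch
    try:
        return raw.decode(encoding)
    except UnicodeDecodeError:
        return raw.decode(encoding, errors='ignore')


# decode_with_fallback(b'caf\xc3\xa9', 'utf-8')  ->  'café'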
Example #10
File: rss.py Project: XZYCR7/rssant
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        story['unique_id'] = shorten(_get_story_unique_id(data), 200)
        content = ''
        if data["content"]:
            # both content and summary may appear in the content list, pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        content = story_html_clean(content)
        content = process_story_links(content, data["link"])
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content
        # TODO: performance
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        story['link'] = data["link"]
        title = shorten(data["title"] or story['link'] or story['unique_id'], 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)
Example #11
def _compute_sentence_count(content: str) -> int:
    return len(split_sentences(story_html_to_text(content)))
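split_sentences is likewise rssant-internal. A rough stand-in that splits on common ASCII and CJK sentence-ending punctuation is sketched below; it only shows how the num_sentences <= num_sub_sentences comparison in Example #7 can work, not how the real splitter is implemented.

import re

# assumed, simplified splitter; the real split_sentences may handle
# abbreviations, quotes and other edge cases differently
_SENT_END = re.compile(r'[.!?。!?]+')


def naive_split_sentences(text: str) -> list:
    parts = _SENT_END.split(text)
    return [p.strip() for p in parts if p.strip()]


def naive_sentence_count(plain_text: str) -> int:
    return len(naive_split_sentences(plain_text))


# naive_sentence_count('One. Two! Three?')  ->  3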
Example #12
def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            # FIXME: feed merge cannot correctly handle feed redirects.
            # For this case, keep the old feed for now and fix it properly later.
            # if target_feed:
            #     LOG.info(f'merge feed#{feed.id} url={feed.url} into '
            #              f'feed#{target_feed.id} url={target_feed.url}')
            #     target_feed.merge(feed)
            #     return
            if target_feed:
                LOG.warning(
                    f'FIXME: redirect feed#{feed.id} url={feed.url!r} into '
                    f'feed#{target_feed.id} url={target_feed.url!r}')
                feed_dict.pop('url')
        # only update dt_updated when there are storys or some feed field changed
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.reverse_url = reverse_url(feed.url)
        feed.status = FeedStatus.READY
        feed.save()
    # save storys, bulk_save_by_feed has standalone transaction
    for s in storys:
        if not s['dt_updated']:
            s['dt_updated'] = now
        if not s['dt_published']:
            # set dt_published to now - 30d to avoid these storys
            # take over mushroom page, i.e. Story.query_recent_by_user
            s['dt_published'] = now_sub_30d
    modified_storys = STORY_SERVICE.bulk_save_by_feed(feed.id,
                                                      storys,
                                                      is_refresh=is_refresh)
    LOG.info('feed#%s save storys total=%s num_modified=%s', feed.id,
             len(storys), len(modified_storys))
    feed = Feed.get_by_pk(feed_id)
    is_freezed = feed.freeze_level is None or feed.freeze_level > 1
    if modified_storys and is_freezed:
        Feed.unfreeze_by_id(feed_id)
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not _is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell(
                'worker_rss.fetch_story',
                dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    feed_id=story.feed_id,
                    offset=story.offset,
                    num_sub_sentences=num_sub_sentences,
                ))
Example #13
def summary(self):
    if self.dt_created < _DATE_LAST_HTML_SUMMARY:
        return story_html_to_text(self._story.summary)
    return self._story.summary
Example #14
def convert_summary(summary):
    return story_html_to_text(summary)
Example #15
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated when there are storys or some feed field changed
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if v != '' and v is not None:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys, is_refresh=is_refresh)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                story_id=str(story.id),
                num_sub_sentences=num_sub_sentences,
            ))
        else:
            _detect_story_images(ctx, story)