Example #1
    def test_story_dt_and_content_length(self):
        dt = timezone.datetime(2019, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
        story = {
            'unique_id': 'blog.example.com/1',
            'title': 'test story 1',
            'dt_published': dt,
            'dt_updated': dt,
        }
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   [validate_story(story)],
                                                   batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(0)
        dt_created = modified[0].dt_created
        dt_published = modified[0].dt_published
        assert modified[0].dt_updated == dt

        dt = dt + timezone.timedelta(days=1)
        updated_content = 'updated_content 1'
        story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
            dt_published=dt,
            dt_updated=dt,
        )
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   [validate_story(story)],
                                                   batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(1)
        assert modified[0].dt_created == dt_created
        assert modified[0].dt_published == dt_published
        assert modified[0].dt_updated == dt
        assert modified[0].content_length == len(updated_content)

        dt = dt + timezone.timedelta(days=2)
        updated_content = 'updated_content 22'
        story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
            dt_published=dt,
            dt_updated=dt,
        )
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   [validate_story(story)],
                                                   batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(1)
        assert modified[0].dt_created == dt_created
        assert modified[0].dt_published == dt_published
        assert modified[0].dt_updated == dt
        assert modified[0].content_length == len(updated_content)
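
Every example on this page funnels content through `compute_hash_base64` so that unchanged feeds and storys can be skipped cheaply. For reference, a minimal sketch of such a helper; the digest algorithm and encoding details are assumptions, not copied from rssant:

import base64
import hashlib


def compute_hash_base64(*contents) -> str:
    # A minimal sketch, not the rssant implementation: hash one or more
    # str/bytes values and return the digest encoded as base64.
    h = hashlib.md5()  # assumption: any stable digest works for change detection
    for content in contents:
        if isinstance(content, str):
            content = content.encode('utf-8')
        h.update(content)
    return base64.standard_b64encode(h.digest()).decode()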
Example #2
def _get_storys(entries: list):
    storys = []
    now = timezone.now()
    for data in entries:
        story = {}
        content = data['content']
        summary = data['summary']
        title = data['title']
        story['has_mathjax'] = data['has_mathjax']
        story['link'] = data['url']
        story['image_url'] = data['image_url']
        story['audio_url'] = data['audio_url']
        story['iframe_url'] = data['iframe_url']
        story['summary'] = summary
        story['content'] = content
        story['sentence_count'] = _compute_sentence_count(content)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['unique_id'] = data['ident']
        story['author'] = data["author_name"]
        dt_published = data['dt_published']
        dt_updated = data['dt_updated']
        story['dt_published'] = min(dt_published or dt_updated or now, now)
        story['dt_updated'] = min(dt_updated or dt_published or now, now)
        storys.append(story)
    return storys
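
The two `min(...)` lines implement a fallback-and-clamp rule: use the first timestamp that is present, but never one later than `now`. A standalone sketch of the behavior, using stdlib `datetime` in place of Django's `timezone`:

from datetime import datetime, timedelta, timezone

now = datetime(2020, 6, 1, tzinfo=timezone.utc)
future = now + timedelta(days=1)
past = now - timedelta(days=1)

assert min(None or future or now, now) == now   # future dates are clamped to now
assert min(None or None or now, now) == now     # both timestamps missing -> now
assert min(past or now, now) == past            # past dates pass through unchanged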
Example #3
File: rss.py Project: jinofhust/rssant
def _parse_found(parsed):
    feed = AttrDict()
    res = parsed.response
    feed.use_proxy = parsed.use_proxy
    feed.url = _get_url(res)
    feed.content_length = len(res.content)
    feed.content_hash_base64 = compute_hash_base64(res.content)
    parsed_feed = parsed.feed
    feed.title = shorten(parsed_feed["title"], 200)
    link = parsed_feed["link"]
    if not link.startswith('http'):
        # Some feeds have a link attribute that is not a URL; fall back to
        # author_detail's href instead, e.g. 'http://www.cnblogs.com/grenet/'
        author_detail = parsed_feed['author_detail']
        if author_detail:
            link = author_detail['href']
    if not link.startswith('http'):
        link = feed.url
    feed.link = link
    feed.author = shorten(parsed_feed["author"], 200)
    feed.icon = parsed_feed["icon"] or parsed_feed["logo"]
    feed.description = parsed_feed["description"] or parsed_feed["subtitle"]
    feed.dt_updated = _get_dt_updated(parsed_feed)
    feed.etag = _get_etag(res)
    feed.last_modified = _get_last_modified(res)
    feed.encoding = res.encoding
    feed.version = shorten(parsed.version, 200)
    entries = list(parsed.entries)  # entries will be modified by _get_storys
    del parsed, res, parsed_feed  # release memory in advance
    feed.storys = _get_storys(entries)
    return validate_feed(feed)
Example #4
def _parse_found(found):
    response: FeedResponse
    raw_result: RawFeedResult
    response, raw_result = found
    feed = AttrDict()

    # feed response
    feed.use_proxy = response.use_proxy
    feed.url = response.url
    feed.content_length = len(response.content)
    feed.content_hash_base64 = compute_hash_base64(response.content)
    feed.etag = response.etag
    feed.last_modified = response.last_modified
    feed.encoding = response.encoding
    del found, response  # release memory in advance

    # parse feed and storys
    result = FeedParser().parse(raw_result)
    del raw_result  # release memory in advance

    feed.title = result.feed['title']
    feed.link = result.feed['home_url']
    feed.author = result.feed['author_name']
    feed.icon = result.feed['icon_url']
    feed.description = result.feed['description']
    feed.dt_updated = result.feed['dt_updated']
    feed.version = result.feed['version']
    feed.storys = _get_storys(result.storys)
    del result  # release memory in advance

    return validate_feed(feed)
Example #5
File: rss.py Project: XZYCR7/rssant
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified)
    with FeedReader() as reader:
        status_code, response = reader.read(url, **params)
    LOG.info(f'read feed#{feed_id} url={unquote(url)} status_code={status_code}')
    if status_code != 200 or not response:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!')
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    parsed = FeedParser.parse_response(response)
    if parsed.bozo:
        LOG.warning(f'failed parse feed#{feed_id} url={unquote(url)}: {parsed.bozo_exception}')
        return
    try:
        feed = _parse_found(parsed)
    except Invalid as ex:
        LOG.warning(f'invalid feed#{feed_id} url={unquote(url)}: {ex}', exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))
Example #6
File: rss.py Project: yaowanyx/rssant
def get_story_of_feed_entry(data, now=None):
    """
    Convert the content of a feedlib.FeedResult into the data that models.Feed needs
    """
    if now is None:
        now = timezone.now()
    story = {}
    content = data['content']
    summary = data['summary']
    title = data['title']
    story['has_mathjax'] = data['has_mathjax']
    story['link'] = data['url']
    story['image_url'] = data['image_url']
    story['audio_url'] = data['audio_url']
    story['iframe_url'] = data['iframe_url']
    story['summary'] = summary
    story['content'] = content
    story['sentence_count'] = _compute_sentence_count(content)
    content_hash_base64 = compute_hash_base64(content, summary, title)
    story['title'] = title
    story['content_hash_base64'] = content_hash_base64
    story['unique_id'] = data['ident']
    story['author'] = data["author_name"]
    dt_published = data['dt_published']
    dt_updated = data['dt_updated']
    story['dt_published'] = min(dt_published or dt_updated or now, now)
    story['dt_updated'] = min(dt_updated or dt_published or now, now)
    return story
Example #7
    def setUp(self):
        print('setUp')
        storys = []
        updated_storys = []
        now = timezone.datetime(2020, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
        for i in range(200):
            dt = now + timezone.timedelta(minutes=i)
            content = f'test story content {i}' * (i % 5)
            content_hash_base64 = compute_hash_base64(content)
            summary = content[:30]
            story = {
                'unique_id': f'blog.example.com/{i}',
                'title': f'test story {i}',
                'content_hash_base64': content_hash_base64,
                'author': 'tester',
                'link': f'https://blog.example.com/{i}.html',
                'dt_published': dt,
                'dt_updated': dt,
                'summary': summary,
                'content': content,
            }
            storys.append(validate_story(story))
            updated_story = dict(story)
            updated_content = f'test story content updated {i}' * (i % 5 + 1)
            updated_story.update(
                content=updated_content,
                content_hash_base64=compute_hash_base64(updated_content),
            )
            updated_storys.append(validate_story(updated_story))
        self.storys = storys
        self.updated_storys = updated_storys

        feed = Feed(
            title='test feed',
            url='https://blog.example.com/feed.xml',
            status=FeedStatus.READY,
            dt_updated=timezone.now(),
        )
        feed.save()
        self.feed_id = feed.id
Example #8
File: rss.py Project: jinofhust/rssant
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        content = ''
        if data["content"]:
            # both content and summary may appear in the content list; pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        link = normlize_url(data["link"])
        valid_link = ''
        if link:
            try:
                valid_link = validate_url(link)
            except Invalid:
                LOG.warning(f'invalid story link {link!r}')
        story['link'] = valid_link
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story link=%r content length=%s, will only save plain text!'
            LOG.warning(msg, link, len(content))
            content = story_html_to_text(content)
        content = process_story_links(content, valid_link)
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        title = shorten(data["title"] or link or summary, 200)
        unique_id = shorten(data['id'] or link or title, 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['unique_id'] = unique_id
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)
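
The `deque(maxlen=300)` enforces the "limit num storys" cap: once the deque is full, each `append` silently discards the oldest item from the left end. A quick illustration:

from collections import deque

d = deque(maxlen=3)
for i in range(5):
    d.append(i)
assert list(d) == [2, 3, 4]  # the two earliest appends were discarded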
Example #9
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified, use_proxy=use_proxy)
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    with FeedReader(**options) as reader:
        response = reader.read(url, **params)
    LOG.info(
        f'read feed#{feed_id} url={unquote(url)} response.status={response.status}'
    )
    if response.status != 200 or not response.content:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url),
                    ex)
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url),
                    warnings)
        return
    try:
        feed = _parse_found((response, raw_result))
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s',
                  feed_id,
                  unquote(url),
                  ex,
                  exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))
Example #10
def _parse_found(found, checksum_data=None, is_refresh=False):
    response: FeedResponse
    raw_result: RawFeedResult
    response, raw_result = found
    feed = AttrDict()

    # feed response
    feed.use_proxy = response.use_proxy
    feed.url = response.url
    feed.content_length = len(response.content)
    feed.content_hash_base64 = compute_hash_base64(response.content)
    feed.etag = response.etag
    feed.last_modified = response.last_modified
    feed.encoding = response.encoding
    feed.response_status = response.status
    del found, response  # release memory in advance

    # parse feed and storys
    checksum = None
    if checksum_data and (not is_refresh):
        checksum = FeedChecksum.load(checksum_data)
    result = FeedParser(checksum=checksum).parse(raw_result)
    checksum_data = result.checksum.dump(limit=300)
    num_raw_storys = len(raw_result.storys)
    warnings = None
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
    del raw_result  # release memory in advance
    msg = "feed url=%r storys=%s changed_storys=%s"
    LOG.info(msg, feed.url, num_raw_storys, len(result.storys))

    feed.title = result.feed['title']
    feed.link = result.feed['home_url']
    feed.author = result.feed['author_name']
    feed.icon = result.feed['icon_url']
    feed.description = result.feed['description']
    feed.dt_updated = result.feed['dt_updated']
    feed.version = result.feed['version']
    feed.storys = _get_storys(result.storys)
    feed.checksum_data = checksum_data
    feed.warnings = warnings
    del result  # release memory in advance

    return validate_feed(feed)
Example #11
def query_old_storys_by_feed(feed_id):
    sql = """
    SELECT unique_id, title, link, author, dt_published, dt_updated, summary, content
    FROM rssant_api_story_bak
    WHERE feed_id=%s
    """
    fields = [
        'unique_id', 'title', 'link', 'author', 'dt_published', 'dt_updated',
        'summary', 'content'
    ]
    storys = []
    with connection.cursor() as cursor:
        cursor.execute(sql, [feed_id])
        for row in cursor.fetchall():
            story = dict(zip(fields, row))
            story['content_hash_base64'] = compute_hash_base64(
                story['content'], story['summary'], story['title'])
            storys.append(story)
    return storys
Example #12
File: rss.py Project: XZYCR7/rssant
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        story['unique_id'] = shorten(_get_story_unique_id(data), 200)
        content = ''
        if data["content"]:
            # both content and summary may appear in the content list; pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        content = story_html_clean(content)
        content = process_story_links(content, data["link"])
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content
        # TODO: performance
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        story['link'] = data["link"]
        title = shorten(data["title"] or story['link'] or story['unique_id'], 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)
Example #13
def do_sync_feed(
        ctx: ActorContext,
        feed_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
        checksum_data: T.bytes.maxlen(4096).optional,
        content_hash_base64: T.str.optional,
        etag: T.str.optional,
        last_modified: T.str.optional,
        is_refresh: T.bool.default(False),
):
    params = {}
    if not is_refresh:
        params = dict(etag=etag, last_modified=last_modified)
    options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    switch_prob = 0.25  # the prob of switch from use proxy to not use proxy
    with FeedReader(**options) as reader:
        use_proxy = reader.has_proxy and use_proxy
        if use_proxy and random.random() < switch_prob:
            use_proxy = False
        response = reader.read(url, **params, use_proxy=use_proxy)
        LOG.info(
            f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        if (not use_proxy) and reader.has_proxy and need_proxy:
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **params, use_proxy=True)
            LOG.info(
                f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}'
            )
            if proxy_response.ok:
                response = proxy_response
    if (not response.ok) or (not response.content):
        status = FeedStatus.READY if response.status == 304 else FeedStatus.ERROR
        _update_feed_info(ctx, feed_id, status=status, response=response)
        return
    new_hash = compute_hash_base64(response.content)
    if (not is_refresh) and (new_hash == content_hash_base64):
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        _update_feed_info(ctx, feed_id, response=response)
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url),
                    ex)
        _update_feed_info(ctx,
                          feed_id,
                          status=FeedStatus.ERROR,
                          response=response,
                          warnings=str(ex))
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url),
                    warnings)
    try:
        feed = _parse_found((response, raw_result),
                            checksum_data=checksum_data,
                            is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s',
                  feed_id,
                  unquote(url),
                  ex,
                  exc_info=ex)
        _update_feed_info(ctx,
                          feed_id,
                          status=FeedStatus.ERROR,
                          response=response,
                          warnings=str(ex))
        return
    ctx.tell('harbor_rss.update_feed',
             dict(feed_id=feed_id, feed=feed, is_refresh=is_refresh))
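
Example #13's proxy handling follows a two-step pattern: try the cheap direct route first (occasionally even when a proxy is preferred), then retry through the proxy only when the failure status suggests it would help. Stripped of the rssant specifics, the control flow looks roughly like the sketch below; all names here are placeholders, not rssant API:

import random


def read_with_proxy_fallback(read, url, *, has_proxy, prefer_proxy,
                             needs_proxy, switch_prob=0.25):
    # Placeholder sketch of the do_sync_feed control flow, not rssant API.
    use_proxy = has_proxy and prefer_proxy
    if use_proxy and random.random() < switch_prob:
        use_proxy = False  # occasionally probe whether direct access recovered
    response = read(url, use_proxy=use_proxy)
    if not use_proxy and has_proxy and needs_proxy(response):
        proxy_response = read(url, use_proxy=True)
        if proxy_response.ok:
            response = proxy_response
    return response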
Example #14
File: helper.py Project: yutaoxu/rssant
    def is_modified(self, content_hash_base64=None, fields=None):
        if content_hash_base64 is None and fields:
            content_hash_base64 = compute_hash_base64(*fields)
        if content_hash_base64 is not None:
            return content_hash_base64 != self.content_hash_base64
        return True
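
A hedged usage sketch of `is_modified`: attach it to an object that stores a `content_hash_base64`, and it reports a change whenever the supplied hash (or the hash of the supplied fields) differs, defaulting to True when there is nothing to compare. The `Detail` class is hypothetical, reusing the `compute_hash_base64` sketch from Example #1:

class Detail:
    # Hypothetical holder for a stored hash; not part of rssant.
    def __init__(self, content_hash_base64):
        self.content_hash_base64 = content_hash_base64

    def is_modified(self, content_hash_base64=None, fields=None):
        if content_hash_base64 is None and fields:
            content_hash_base64 = compute_hash_base64(*fields)
        if content_hash_base64 is not None:
            return content_hash_base64 != self.content_hash_base64
        return True


detail = Detail(compute_hash_base64('content', 'summary', 'title'))
assert not detail.is_modified(fields=['content', 'summary', 'title'])
assert detail.is_modified(fields=['changed', 'summary', 'title'])
assert detail.is_modified()  # nothing to compare -> assume modified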