示例#1
0
def test_raw_parse_bad_encoding():
    """Random bytes are not a decodable feed; RawFeedParser must raise FeedParserError."""
    random_bytes = os.urandom(16 * 1024)
    response = _create_builder(content=random_bytes).build()
    with pytest.raises(FeedParserError) as excinfo:
        RawFeedParser().parse(response)
    assert excinfo
示例#2
0
def test_raw_parse_warn(filename):
    """Feeds in the 'warn' data dir parse with warnings but still yield storys and feed info."""
    response = _read_response(_data_dir / 'warn', filename)
    result = RawFeedParser().parse(response)
    assert result
    assert isinstance(result.warnings, list) and result.warnings
    assert result.storys
    assert result.feed['version']
    assert result.feed['title']
示例#3
0
def test_raw_parse_bad_encoding():
    """Building a response from undecodable random content must make parsing fail."""
    builder = FeedResponseBuilder()
    builder.url('https://blog.example.com/feed')
    builder.content(os.urandom(16 * 1024))
    parser = RawFeedParser()
    with pytest.raises(FeedParserError) as excinfo:
        parser.parse(builder.build())
    assert excinfo
示例#4
0
def test_parser_and_checksum(filepath):
    """Run the full raw-parse -> FeedParser pipeline; checksum must cover every story."""
    response = _read_response(_data_dir, filepath)
    raw_result = RawFeedParser().parse(response)
    assert raw_result.feed
    assert raw_result.storys
    result = FeedParser().parse(raw_result)
    assert result.feed
    assert result.storys
    # every parsed story contributes one entry to the checksum
    assert result.checksum.size() == len(result.storys)
示例#5
0
def test_raw_parser_incomplete_content():
    """A truncated/incomplete feed should still yield the storys that are present."""
    response = _read_response(_data_dir / 'warn', 'https-tmioe-com-feed.xml')
    result = RawFeedParser().parse(response)
    assert len(result.storys) == 5
    assert result.feed['version'] == 'rss20'
    assert result.feed['title'] == 'ZAPRO · 杂铺'
    expect_title = "TikTok 抖音国际版 v18.6.2 解锁全部国家任意切换"
    expect_url = "https://tmioe.com/1463.html"
    matched = [story for story in result.storys if story['url'] == expect_url]
    assert matched
    assert matched[0]['title'] == expect_title
示例#6
0
def test_parse_story_no_id_no_summary_no_url():
    """Storys missing id/summary/url: the id-less story is skipped, a valid
    url-like id is promoted to url, and summary is derived from content.

    The fixture holds 3 storys: one with no id (skipped), story#0 with no
    content/summary/url, story#1 with content and a valid url-like id.
    """
    filename = 'well/v2ex-no-id-no-summary-no-url.xml'
    response = _read_response(_data_dir, filename)

    raw = RawFeedParser().parse(response)
    assert raw.storys
    # the story without an id is dropped, leaving two
    assert len(raw.storys) == 2
    # neither surviving story carries a summary
    assert not raw.storys[0]['summary']
    assert not raw.storys[1]['summary']
    # only story#1 has content
    assert not raw.storys[0]['content']
    assert raw.storys[1]['content']
    # only the valid url-like id becomes a url; the invalid one is discarded
    assert not raw.storys[0]['url']
    assert raw.storys[1]['url']

    parsed = FeedParser().parse(raw)
    assert parsed.storys
    assert len(parsed.storys) == len(raw.storys)
    # content survives the second parsing stage
    assert not parsed.storys[0]['content']
    assert parsed.storys[1]['content']
    # summary is extracted from content where content exists
    assert not parsed.storys[0]['summary']
    assert parsed.storys[1]['summary']
    # url selection carries through unchanged
    assert not parsed.storys[0]['url']
    assert parsed.storys[1]['url']
示例#7
0
def test_parse_too_many_storys():
    """When a feed carries more storys than _MAX_STORYS, FeedParser keeps
    only the newest _MAX_STORYS ones (undated storys rank as oldest).
    """
    num_storys = 2000
    base = datetime.datetime.now()
    items = []
    for index in range(num_storys):
        # first half has no publish date; second half is dated incrementally
        if index < num_storys // 2:
            published = None
        else:
            published = (base + datetime.timedelta(seconds=index)).isoformat()
        items.append({
            "id": f"{index}",
            "content_html": f"content_{index}",
            "summary": f"summary_{index}",
            "url": f"https://example.org/post/{index}",
            "date_published": published,
        })
    feed = {
        "version": "https://jsonfeed.org/version/1",
        "title": "Too many storys",
        "home_page_url": "https://example.org/",
        "feed_url": "https://example.org/feed.json",
        "items": items
    }
    response = _create_builder(json.dumps(feed).encode('utf-8')).build()
    raw_result = RawFeedParser().parse(response)
    # the raw stage keeps everything
    assert len(raw_result.storys) == num_storys
    result = FeedParser().parse(raw_result)
    assert len(result.storys) == _MAX_STORYS
    # the survivors are exactly the newest (highest-index) items
    kept_ids = {int(story['ident']) for story in result.storys}
    assert kept_ids == set(range(num_storys - _MAX_STORYS, num_storys))
示例#8
0
def _parse_well_feed(filename) -> FeedResult:
    """Parse a known-good feed from the 'well' data dir and return the result.

    Asserts the raw parse produces feed info and storys with no warnings,
    and that the second stage preserves the story count.
    """
    response = _read_response(_data_dir / 'well', filename)
    raw = RawFeedParser().parse(response)
    assert raw.feed
    assert raw.storys
    assert not raw.warnings
    parsed = FeedParser().parse(raw)
    assert len(parsed.storys) == len(raw.storys)
    return parsed
示例#9
0
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
):
    """Fetch one feed over HTTP, parse it, and publish the result.

    On success, tells 'harbor_rss.update_feed' with the parsed feed.
    Returns early (without telling the harbor) when: the response is not
    a 200 with content, the content hash matches content_hash_base64
    (unchanged since last sync), or parsing raises/produces warnings.
    """
    # etag/last_modified enable conditional requests (HTTP cache validators)
    params = dict(etag=etag, last_modified=last_modified, use_proxy=use_proxy)
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    with FeedReader(**options) as reader:
        response = reader.read(url, **params)
    LOG.info(
        f'read feed#{feed_id} url={unquote(url)} response.status={response.status}'
    )
    # only proceed on a successful response that actually has a body
    if response.status != 200 or not response.content:
        return
    # skip re-parsing when content is byte-identical to the previous sync
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url),
                    ex)
        return
    # any raw-parse warning aborts the sync for this feed
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url),
                    warnings)
        return
    try:
        feed = _parse_found((response, raw_result))
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s',
                  feed_id,
                  unquote(url),
                  ex,
                  exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))
示例#10
0
def test_raw_parse_date_timestamp():
    """A numeric <pubDate> (unix timestamp) must be parsed as a UTC datetime."""
    xml_text = '''
    <rss version="2.0">
    <channel>
    <title>博客中国</title>
    <generator>http://www.blogchina.com</generator>
    <item>
        <title><![CDATA[ 新能源车崛起,传统汽车如何避免诺基亚式危机? ]]></title>
        <link>http://jianghulaoliu.blogchina.com/956463775.html</link>
        <description><![CDATA[ 近日,特斯拉降价16万使得特斯拉消费激增 ]]></description>
        <source></source>
        <pubDate>1611146768</pubDate>
    </item>
    '''
    response = _create_builder(content=xml_text.encode('utf-8')).build()
    result = RawFeedParser().parse(response)
    assert len(result.storys) == 1
    published: datetime.datetime = result.storys[0]['dt_published']
    assert published == datetime.datetime.fromtimestamp(1611146768, tz=UTC)
示例#11
0
def test_parse_large_content(template_name, content_length, summary_length):
    """Oversized story content/summary must be truncated to the configured limits."""
    content_snip = "<span>12345678</span>"
    summary_snip = '<span>123</span>'
    # repeat each snippet until it exceeds the requested length
    content = content_snip * ((content_length // len(content_snip)) + 1)
    summary = summary_snip * ((summary_length // len(summary_snip)) + 1)
    template = large_feed_templates[template_name]
    # use replace instead format to avoid KeyError for json string
    filled = template.replace('${content}', content).replace('${summary}', summary)
    response = _create_builder(content=filled.encode('utf-8')).build()
    raw_result = RawFeedParser().parse(response)
    assert raw_result and len(raw_result.storys) == 1
    raw_story = raw_result.storys[0]
    assert len(raw_story['content']) <= _RAW_MAX_CONTENT_LENGTH
    assert len(raw_story['summary']) <= _RAW_MAX_SUMMARY_LENGTH
    result = FeedParser().parse(raw_result)
    assert result and len(result.storys) == 1
    story = result.storys[0]
    assert len(story['content']) <= _MAX_CONTENT_LENGTH
    assert len(story['summary']) <= _MAX_SUMMARY_LENGTH
示例#12
0
def do_sync_feed(
        ctx: ActorContext,
        feed_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
        checksum_data: T.bytes.maxlen(4096).optional,
        content_hash_base64: T.str.optional,
        etag: T.str.optional,
        last_modified: T.str.optional,
        is_refresh: T.bool.default(False),
):
    """Fetch one feed, parse it, and tell 'harbor_rss.update_feed'.

    Reads the url (optionally via proxy, with a proxy retry when the
    direct read fails with a proxy-worthy status), short-circuits on an
    unchanged content hash unless is_refresh, then parses with
    RawFeedParser followed by _parse_found. Failures update the feed's
    status via _update_feed_info instead of publishing.
    """
    params = {}
    # a forced refresh ignores HTTP cache validators (etag/last_modified)
    if not is_refresh:
        params = dict(etag=etag, last_modified=last_modified)
    options = _proxy_helper.get_proxy_options()
    # DNS-resolved urls are reachable directly; don't waste proxy bandwidth
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    switch_prob = 0.25  # the prob of switch from use proxy to not use proxy
    with FeedReader(**options) as reader:
        use_proxy = reader.has_proxy and use_proxy
        # occasionally probe without proxy so feeds can recover direct access
        if use_proxy and random.random() < switch_prob:
            use_proxy = False
        response = reader.read(url, **params, use_proxy=use_proxy)
        LOG.info(
            f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        # direct read failed with a proxy-worthy status: retry through proxy
        if (not use_proxy) and reader.has_proxy and need_proxy:
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **params, use_proxy=True)
            LOG.info(
                f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}'
            )
            if proxy_response.ok:
                response = proxy_response
    if (not response.ok) or (not response.content):
        # 304 Not Modified is a healthy outcome; anything else marks an error
        status = FeedStatus.READY if response.status == 304 else FeedStatus.ERROR
        _update_feed_info(ctx, feed_id, status=status, response=response)
        return
    new_hash = compute_hash_base64(response.content)
    # skip re-parsing when content is byte-identical to the previous sync
    if (not is_refresh) and (new_hash == content_hash_base64):
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        _update_feed_info(ctx, feed_id, response=response)
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url),
                    ex)
        _update_feed_info(ctx,
                          feed_id,
                          status=FeedStatus.ERROR,
                          response=response,
                          warnings=str(ex))
        return
    # raw-parse warnings are logged but do not abort the sync
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url),
                    warnings)
    try:
        feed = _parse_found((response, raw_result),
                            checksum_data=checksum_data,
                            is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s',
                  feed_id,
                  unquote(url),
                  ex,
                  exc_info=ex)
        _update_feed_info(ctx,
                          feed_id,
                          status=FeedStatus.ERROR,
                          response=response,
                          warnings=str(ex))
        return
    ctx.tell('harbor_rss.update_feed',
             dict(feed_id=feed_id, feed=feed, is_refresh=is_refresh))
示例#13
0
def test_raw_parse_failed(filename):
    """Feeds in the 'failed' data dir must make RawFeedParser raise FeedParserError."""
    response = _read_response(_data_dir / 'failed', filename)
    with pytest.raises(FeedParserError) as excinfo:
        RawFeedParser().parse(response)
    assert excinfo