Example No. 1
def test_search_entries_order_content(reader):
    parser = Parser()
    reader._parser = parser

    feed = parser.feed(1, datetime(2010, 1, 1))
    one = parser.entry(
        1,
        1,
        datetime(2010, 1, 1),
        summary='word word',
        content=[
            Content('word'),
            Content('does not match'),
            Content('word word word word'),
            Content('word word word'),
        ],
    )

    reader.add_feed(feed.url)
    reader.update_feeds()
    reader.enable_search()
    reader.update_search()

    # there should be exactly one result
    (rv, ) = reader.search_entries('word')
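    # .content maps paths to highlights, ordered by relevance (the content
    # with the most matching words first); non-matching content is omitted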
    assert list(rv.content) == [
        '.content[2].value',
        '.content[3].value',
        '.summary',
        '.content[0].value',
    ]
Example No. 2
def reader_without_and_with_entries(request, reader):
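    # parametrized fixture: with a falsy param, return the reader with no entries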
    if not request.param:
        return reader

    parser = Parser()
    reader._parser = parser

    feed = parser.feed(1, datetime(2010, 1, 1))
    parser.entry(
        1,
        1,
        datetime(2010, 1, 1),
        title='feed one',
        summary='summary',
        content=[Content('content'),
                 Content('another content')],
    )
    parser.entry(1, 2, datetime(2010, 1, 1), title='feed one')
    parser.entry(1, 3, datetime(2010, 1, 1), title='feed one')
    parser.entry(1, 4, datetime(2010, 1, 1), title='feed one')
    parser.entry(1, 5, datetime(2010, 1, 1), title='feed one')

    reader.add_feed(feed.url)
    reader.update_feeds()

    return reader
Example No. 3
def content(self):
    # expose each matching content from the search result both as plain
    # text and as highlighted HTML
    rv = []
    for path, highlight in self._search_result.content.items():
        # TODO: find a more correct way to match .content[0].value
        if path.startswith('.content[') and path.endswith('].value'):
            rv.append(Content(str(highlight), 'text/plain'))
            rv.append(Content(highlighted(highlight), 'text/html'))
    return rv
Example No. 4
def test_search_entries_order_weights(reader, chunk_size):
    """Entry title beats feed title beats entry content/summary."""

    # TODO: may need fixing once we finish tuning the weights (it should fail)

    reader._search.storage.chunk_size = chunk_size

    parser = Parser()
    reader._parser = parser

    feed_one = parser.feed(1, datetime(2010, 1, 1), title='one')
    entry_one = parser.entry(1, 1, datetime(2010, 1, 1))
    feed_two = parser.feed(2, datetime(2010, 1, 1), title='two')
    entry_two = parser.entry(2, 2, datetime(2010, 1, 1), title='one')
    entry_three = parser.entry(2,
                               3,
                               datetime(2010, 1, 1),
                               content=[Content('one')])
    entry_four = parser.entry(2, 4, datetime(2010, 1, 1), summary='one')
    entry_five = parser.entry(2,
                              5,
                              datetime(2010, 1, 1),
                              content=[Content('one')] * 2)
    entry_six = parser.entry(2,
                             6,
                             datetime(2010, 1, 1),
                             summary='one',
                             content=[Content('one')])
    entry_seven = parser.entry(2,
                               7,
                               datetime(2010, 1, 1),
                               title="does not match")

    reader.add_feed(feed_one.url)
    reader.add_feed(feed_two.url)
    reader.update_feeds()
    reader.enable_search()
    reader.update_search()

    rv = [(e.id, e.feed_url) for e in reader.search_entries('one')]

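    # an entry title match (entry_two) outranks a feed title match (entry_one)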
    assert rv[:2] == [(entry_two.id, feed_two.url),
                      (entry_one.id, feed_one.url)]

    # TODO: how do we check these have the same exact rank?
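    # entries matching only in content/summary are expected to tie, hence sorted()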
    assert sorted(rv[2:]) == [
        (entry_three.id, feed_two.url),
        (entry_four.id, feed_two.url),
        (entry_five.id, feed_two.url),
        (entry_six.id, feed_two.url),
    ]
Example No. 5
def test_update_search_concurrent_calls(db_path, monkeypatch):
    """Test concurrent calls to reader.update_search() don't interfere
    with one another.

    https://github.com/lemon24/reader/issues/175#issuecomment-652489019

    """
    # This is a very intrusive test, maybe we should move it somewhere else.

    reader = make_reader(db_path)
    parser = reader._parser = Parser()

    feed = parser.feed(1, datetime(2010, 1, 1), title='feed')
    parser.entry(
        1,
        1,
        datetime(2010, 1, 1),
        title='entry',
        summary='summary',
        content=[Content('content')],
    )
    reader.add_feed(feed.url)
    reader.update_feeds()
    reader.enable_search()

    barrier = threading.Barrier(2)
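    # each update_search() call blocks in strip_html() until the other arrives,
    # guaranteeing the two calls overlap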

    def target():
        from reader._search import Search

        class MySearch(Search):
            @staticmethod
            def strip_html(*args, **kwargs):
                barrier.wait()
                return Search.strip_html(*args, **kwargs)

        # TODO: remove monkeypatching when make_reader() gets a search_cls argument
        monkeypatch.setattr('reader.core.Search', MySearch)

        reader = make_reader(db_path)
        reader.update_search()

    threads = [threading.Thread(target=target) for _ in range(2)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    (result, ) = reader.search_entries('entry')
    assert len(result.content) == 2

    ((rowcount, ),
     ) = reader._search.db.execute("select count(*) from entries_search;")
    assert rowcount == 2
Example No. 6
def test_search_entries_order_content_recent(reader):
    """When sort='recent' is used, the .content of any individual result
    should still be sorted by relevance.

    """
    parser = Parser()
    reader._parser = parser

    feed = parser.feed(1, datetime(2010, 1, 1))
    one = parser.entry(
        1,
        1,
        datetime(2010, 1, 1),
        title='word',
        content=[
            Content('word word'),
            Content('word'),
            Content('word word word')
        ],
    )
    two = parser.entry(1, 2, datetime(2010, 1, 2), summary='word')
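    # two is newer (updated 2010-01-02), but one is more relevant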

    reader.add_feed(feed.url)
    reader.update_feeds()
    reader.enable_search()
    reader.update_search()

    # sanity check, one is more relevant
    assert [e.id for e in reader.search_entries('word')] == ['1, 1', '1, 2']

    results = list(reader.search_entries('word', sort='recent'))
    # two is first because it was updated more recently
    assert [e.id for e in results] == ['1, 2', '1, 1']
    # but within entry one, the content keys are still sorted by relevance:
    assert list(results[1].content) == [
        '.content[2].value',
        '.content[0].value',
        '.content[1].value',
    ]
Example No. 7
def test_search_entries_order_title_content_beats_title(reader):
    parser = Parser()
    reader._parser = parser

    feed = parser.feed(1, datetime(2010, 1, 1))
    one = parser.entry(1, 1, datetime(2010, 1, 1), title='one')
    two = parser.entry(1, 2, datetime(2010, 1, 1), title='two')
    three = parser.entry(1,
                         3,
                         datetime(2010, 1, 1),
                         title='one',
                         content=[Content('one')])

    reader.add_feed(feed.url)
    reader.update_feeds()
    reader.enable_search()
    reader.update_search()

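    # three matches in both title and content, so it ranks above one (title only)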
    assert [(e.id, e.feed_url) for e in reader.search_entries('one')] == [
        (three.id, feed.url),
        (one.id, feed.url),
    ]
Example No. 8
import datetime

from reader import Content
from reader import Enclosure
from reader._types import EntryData
from reader._types import FeedData

feed = FeedData(url='{}relative.rss'.format(url_base),
                link='{}file.html'.format(rel_base))

entries = [
    EntryData(
        feed_url=feed.url,
        id='7bd204c6-1655-4c27-aeee-53f933c5395f',
        updated=None,
        link='{}blog/post/1'.format(rel_base),
        summary='one <a href="{}target">two</a> three'.format(rel_base),
        content=(
            Content(value='<script>evil</script> content',
                    type='text/plain',
                    language=None),
            Content(value='content', type='text/html', language=None),
        ),
        enclosures=(
            # for RSS, feedparser doesn't make relative links absolute
            # (it does for Atom)
            Enclosure(href='enclosure?q=a#fragment'), ),
    )
]
Example No. 9
def make_entry(title=None, summary=None, content=None):
    entry = Entry('id', None, title=title, summary=summary)
    if content:
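        # content, if given, is a tuple of Content's positional arguments
        # (value, type, language)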
        entry = entry._replace(content=[Content(*content)])
    return entry
Example No. 10
    author='John Doe',
)

entries = [
    EntryData(
        feed_url=feed.url,
        id='urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a',
        updated=datetime.datetime(2003, 12, 13, 18, 30, 2),
        title='Atom-Powered Robots Run Amok',
        link='http://example.org/2003/12/13/atom03',
        author='John Doe',
        published=datetime.datetime(2003, 12, 13, 17, 17, 51),
        summary='Some text.',
        content=(
            # the text/plain type comes from feedparser
            Content(value='content', type='text/plain'),
            Content(value='content with type', type='text/whatever'),
            Content(value='content with lang',
                    type='text/plain',
                    language='en'),
        ),
        enclosures=(
            # the text/html type comes from feedparser
            Enclosure(href='http://example.org/enclosure', type='text/html'),
            Enclosure(href='http://example.org/enclosure-with-type',
                      type='text/whatever'),
            Enclosure(
                href='http://example.org/enclosure-with-length',
                type='text/html',
                length=1000,
            ),
Example No. 11
import datetime

from reader import Content
from reader import Enclosure
from reader._types import EntryData
from reader._types import FeedData


feed = FeedData(
    url='{}empty.json'.format(url_base),
)

entries = [
    EntryData(
        feed_url=feed.url,
        id='1',
        updated=None,
        content=(
            Content(
                value='content',
                type='text/plain',
            ),
        ),
    ),
]
Example No. 12
def test_search_entries_basic(reader, sort):
    parser = Parser()
    reader._parser = parser

    feed = parser.feed(1, datetime(2010, 1, 1))
    one = parser.entry(1, 1, datetime(2010, 1, 1), title='one')
    two = parser.entry(1,
                       2,
                       datetime(2010, 1, 1),
                       title='two',
                       summary='summary')
    three = parser.entry(
        1,
        3,
        datetime(2010, 1, 1),
        title='shall not be named',
        summary='does not match',
        # The emoji is to catch a bug in the json_extract() SQLite function.
        # As of reader 1.4 we're not using it anymore, and the workaround
        # was removed; we keep the emoji in case of regressions.
        # Bug: https://bugs.python.org/issue38749
        # Workaround and more details: https://github.com/lemon24/reader/blob/d4363f683fc18ca12f597809ceca4e7dbd0a303a/src/reader/_sqlite_utils.py#L332
        content=[Content('three 🤩 content')],
    )

    reader.add_feed(feed.url)
    reader.update_feeds()

    reader.enable_search()

    assert list(reader.search_entries('one')) == []
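    # nothing is searchable until update_search() is called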

    reader.update_search()

    search = lambda *a, **kw: reader.search_entries(*a, sort=sort, **kw)
    search_counts = lambda *a, **kw: reader.search_entry_counts(*a, **kw)

    # TODO: the asserts below look parametrizable

    assert list(search('zero')) == []
    assert search_counts('zero') == EntrySearchCounts(0, 0, 0, 0)
    assert list(search('one')) == [
        EntrySearchResult(
            feed.url,
            one.id,
            {
                '.title': HighlightedString(one.title, (slice(0, 3), )),
                '.feed.title': HighlightedString(feed.title),
            },
        )
    ]
    assert search_counts('one') == EntrySearchCounts(1, 0, 0, 0)
    assert list(search('two')) == [
        EntrySearchResult(
            feed.url,
            two.id,
            {
                '.title': HighlightedString(two.title, (slice(0, 3), )),
                '.feed.title': HighlightedString(feed.title),
            },
            {'.summary': HighlightedString('summary')},
        )
    ]
    assert list(search('three')) == [
        EntrySearchResult(
            feed.url,
            three.id,
            {
                '.title': HighlightedString(three.title),
                '.feed.title': HighlightedString(feed.title),
            },
            {
                '.content[0].value':
                HighlightedString(three.content[0].value, (slice(0, 5), ))
            },
        )
    ]

    # TODO: fix inconsistent naming

    feed_two = parser.feed(2, datetime(2010, 1, 2))
    feed_two_entry = parser.entry(2, 1, datetime(2010, 1, 2), title=None)
    feed_three = parser.feed(3, datetime(2010, 1, 1), title=None)
    feed_three_entry = parser.entry(3,
                                    1,
                                    datetime(2010, 1, 1),
                                    title='entry summary')
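    # 'summary' should now match feed_three_entry's title, feed_two's user title,
    # and two's summary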

    reader.add_feed(feed_two.url)
    reader.add_feed(feed_three)
    reader.set_feed_user_title(feed_two, 'a summary of things')

    reader.update_feeds()
    feed_two_entry = reader.get_entry((feed_two.url, feed_two_entry.id))

    reader.update_search()

    # We can't use a set here because the dicts in EntrySearchResult aren't hashable.
    assert {
        (e.feed_url, e.id): e
        for e in search('summary')
    } == {
        (e.feed_url, e.id): e
        for e in [
            EntrySearchResult(
                feed_three.url,
                feed_three_entry.id,
                {
                    '.title':
                    HighlightedString(feed_three_entry.title, (slice(6, 13), ))
                },
            ),
            EntrySearchResult(
                feed_two.url,
                feed_two_entry.id,
                {
                    '.feed.user_title':
                    HighlightedString(feed_two_entry.feed.user_title, (
                        slice(2, 9), ))
                },
            ),
            EntrySearchResult(
                feed.url,
                two.id,
                {
                    '.title': HighlightedString(two.title),
                    '.feed.title': HighlightedString(feed.title),
                },
                {'.summary': HighlightedString(two.summary, (slice(0, 7), ))},
            ),
        ]
    }
    assert search_counts('summary') == EntrySearchCounts(3, 0, 0, 0)
Example No. 13
def test_update_triggers_no_change(db_path, monkeypatch, set_user_title):
    """update_search() should *not* update the search index
    if anything else except the indexed fields changes.

    """
    from reader._search import Search

    strip_html_called = 0

    class MySearch(Search):
        @staticmethod
        def strip_html(*args, **kwargs):
            nonlocal strip_html_called
            strip_html_called += 1
            return Search.strip_html(*args, **kwargs)

    # TODO: remove monkeypatching when make_reader() gets a search_cls argument
    monkeypatch.setattr('reader.core.Search', MySearch)

    reader = make_reader(db_path)
    reader._parser = parser = Parser()

    feed = parser.feed(1, datetime(2010, 1, 1), title='feed')
    entry = parser.entry(
        1,
        1,
        datetime(2010, 1, 1),
        title='entry',
        summary='summary',
        content=[Content('content')],
    )

    reader.add_feed(feed.url)
    reader.update_feeds()
    if set_user_title:
        reader.set_feed_user_title(feed, 'user title')

    reader.enable_search()
    reader.update_search()

    assert strip_html_called > 0
    strip_html_called = 0

    (old_result, ) = reader.search_entries('entry OR feed')

    feed = parser.feed(1,
                       datetime(2010, 1, 2),
                       title='feed',
                       link='link',
                       author='author')
    """
    entry = parser.entry(
        1, 1, datetime(2010, 1, 2),
        title='entry',
        summary='summary',
        content=[Content('content')],
        link='link', author='author',
        published=datetime(2010, 1, 2),
        enclosures=[Enclosure('enclosure')],
    )
    """
    # NOTE: As of 1.4, updating entries normally (above) uses INSERT OR REPLACE.
    # REPLACE == DELETE + INSERT (https://www.sqlite.org/lang_conflict.html),
    # so updating the entry normally *will not* fire the ON UPDATE trigger,
    # but the ON DELETE and ON INSERT ones (basically, the ON UPDATE trigger
    # never fires at the moment).
    #
    # Meanwhile, we do a (more intrusive/brittle) manual update:
    with reader._search.db as db:
        db.execute("""
            UPDATE entries
            SET (
                title,
                link,
                updated,
                author,
                published,
                summary,
                content,
                enclosures
            ) = (
                'entry',
                'http://www.example.com/entries/1',
                '2010-01-02 00:00:00',
                'author',
                '2010-01-02 00:00:00',
                'summary',
                '[{"value": "content", "type": null, "language": null}]',
                '[{"href": "enclosure", "type": null, "length": null}]'
            )
            WHERE (id, feed) = ('1, 1', '1');
            """)
    # TODO: Change this test when updating entries uses UPDATE instead of INSERT OR REPLACE

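    # read/important are not indexed fields, so changing them must not trigger re-indexing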
    reader.mark_as_read(entry)
    reader.mark_as_important(entry)

    reader.update_feeds()
    if set_user_title:
        reader.set_feed_user_title(feed, 'user title')

    reader.update_search()

    (new_result, ) = reader.search_entries('entry OR feed')

    assert old_result == new_result
    assert strip_html_called == 0
Example No. 14
         ['.title', '.feed.title', '.summary'],
     ),
     (
         lambda r: r._parser.entry(
             1, 1, datetime(2010, 1, 3), summary='new'),
         ['.title', '.feed.title', '.summary'],
     ),
 ],
 "after update on entries: content": [
     (
         lambda r: r._parser.entry(1, 1, datetime(2010, 1, 1)),
         ['.title', '.feed.title'],
     ),
     (
         lambda r: r._parser.entry(
             1, 1, datetime(2010, 1, 2), content=[Content('old')]),
         ['.title', '.feed.title', '.content[0].value'],
     ),
     (
         lambda r: r._parser.entry(
             1, 1, datetime(2010, 1, 3), content=[Content('new')]),
         ['.title', '.feed.title', '.content[0].value'],
     ),
     (
         lambda r: r._parser.entry(
             1,
             1,
             datetime(2010, 1, 4),
             content=[Content('new'),
                      Content('another one')],
         ),
Example No. 15
    author='Example editor ([email protected])',
)

entries = [
    EntryData(
        feed_url=feed.url,
        id='7bd204c6-1655-4c27-aeee-53f933c5395f',
        updated=datetime.datetime(2009, 9, 6, 16, 20),
        title='Example entry',
        link='http://www.example.com/blog/post/1',
        author='Example editor',
        published=None,
        summary='Here is some text containing an interesting description.',
        content=(
            # the text/plain type comes from feedparser
            Content(value='Example content', type='text/plain'),
        ),
        enclosures=(
            Enclosure(href='http://example.com/enclosure'),
            Enclosure(href='http://example.com/enclosure-with-type', type='image/jpeg'),
            Enclosure(href='http://example.com/enclosure-with-length', length=100000),
            Enclosure(href='http://example.com/enclosure-with-bad-length'),
        ),
    ),
    EntryData(
        feed_url=feed.url,
        id='00000000-1655-4c27-aeee-00000000',
        updated=datetime.datetime(2009, 9, 6, 0, 0, 0),
        title='Example entry, again',
    ),
]