示例#1
0
def test_delete_nonexistant():
    """Deleting a document number that does not exist raises IndexingError.

    Three documents are indexed, so document number 5 is out of range. The
    check runs once against an index committed in one go and once against an
    index built from three separate ``merge=False`` commits.
    """
    from whoosh.writing import IndexingError

    schema = fields.Schema(id=fields.ID(stored=True))

    def assert_delete_fails(ix):
        # Acquire the writer *before* entering the try block: if ix.writer()
        # itself raised, a finally clause wrapping it would call cancel() on
        # the previous, already-committed writer and mask the real error.
        w = ix.writer()
        try:
            assert_raises(IndexingError, w.delete_document, 5)
        finally:
            # Release the writer whether or not the assertion held.
            w.cancel()

    # Single segment
    with TempIndex(schema, "deletenon1") as ix:
        w = ix.writer()
        for char in u("ABC"):
            w.add_document(id=char)
        w.commit()

        assert_delete_fails(ix)

    # Multiple segments
    with TempIndex(schema, "deletenon2") as ix:
        for char in u("ABC"):
            w = ix.writer()
            w.add_document(id=char)
            w.commit(merge=False)

        assert_delete_fails(ix)
示例#2
0
def test_reindex():
    """Updating the same three unique-id documents twice keeps the count at 3."""
    docs = [
        dict(id=u('test1'),
             text=u('This is a document. Awesome, is it not?')),
        dict(id=u('test2'),
             text=u('Another document. Astounding!')),
        dict(id=u('test3'),
             text=u('A fascinating article on the behavior of domestic '
                    'steak knives.')),
    ]

    id_field = fields.ID(unique=True, stored=True)
    schema = fields.Schema(text=fields.TEXT(stored=True), id=id_field)

    with TempIndex(schema, "reindex") as ix:
        # Two identical passes: the second one must replace, not duplicate.
        for _ in range(2):
            writer = ix.writer()
            for fieldvalues in docs:
                writer.update_document(**fieldvalues)
            writer.commit()

            assert_equal(ix.doc_count_all(), 3)
示例#3
0
def test_update():
    """Smoke-test update_document on a schema with two unique fields.

    The test has no assertions; it passes as long as indexing and the update
    complete without raising.
    """
    schema = fields.Schema(id=fields.ID(unique=True, stored=True),
                           path=fields.ID(unique=True, stored=True),
                           text=fields.TEXT)

    initial_docs = [
        dict(id=u("test1"), path=u("/test/1"), text=u("Hello")),
        dict(id=u("test2"), path=u("/test/2"), text=u("There")),
        dict(id=u("test3"), path=u("/test/3"), text=u("Reader")),
    ]

    with TempIndex(schema, "update") as ix:
        w = ix.writer()
        for fieldvalues in initial_docs:
            w.add_document(**fieldvalues)
        w.commit()

        # NOTE(review): the path below has no leading slash, unlike the
        # sample documents -- confirm whether "/test/1" was intended.
        replacement = dict(id=u("test2"),
                           path=u("test/1"),
                           text=u("Replacement"))
        w = ix.writer()
        w.update_document(**replacement)
        w.commit()
示例#4
0
def test_noscorables1():
    """Index 1000 random id/tags documents and run a Term search on ``id``.

    Nothing is asserted about the results; the search only has to complete
    without raising.
    """
    from random import choice, sample, randint

    values = u("alfa bravo charlie delta echo foxtrot golf hotel india "
               "juliet kilo lima").split()
    times = 1000

    schema = fields.Schema(id=fields.ID, tags=fields.KEYWORD)
    with TempIndex(schema, "noscorables1") as ix:
        w = ix.writer()
        for _ in xrange(times):
            # Draw the random values in the same order as the keyword
            # arguments are listed: id first, then tag count, then tags.
            doc_id = choice(values)
            tag_count = randint(2, 7)
            tags = u(" ").join(sample(values, tag_count))
            w.add_document(id=doc_id, tags=tags)
        w.commit()

        with ix.searcher() as s:
            s.search(query.Term("id", "bravo"))
示例#5
0
def test_removefield():
    """Removing fields drops them from the schema, terms and stored values."""
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.TEXT,
                           city=fields.KEYWORD(stored=True))
    rows = [
        ("b", "bravo", "baghdad"),
        ("c", "charlie", "cairo"),
        ("d", "delta", "dakar"),
    ]

    with TempIndex(schema, "removefield") as ix:
        w = ix.writer()
        for doc_id, content, city in rows:
            w.add_document(id=u(doc_id), content=u(content), city=u(city))
        w.commit()

        # Before removal both stored fields come back.
        with ix.searcher() as s:
            stored = s.document(id=u("c"))
            assert_equal(stored, {"id": "c", "city": "cairo"})

        w = ix.writer()
        for fieldname in ("content", "city"):
            w.remove_field(fieldname)
        w.commit()

        # Only "id" is left in the schema ...
        current = ix._current_schema()
        assert_equal(current.names(), ["id"])
        assert_equal(current.stored_names(), ["id"])

        # ... and the removed fields are gone from terms and stored values.
        with ix.searcher() as s:
            assert ("content", u("charlie")) not in s.reader()
            stored = s.document(id=u("c"))
            assert_equal(stored, {"id": u("c")})
示例#6
0
def test_suggest_prefix():
    """Check Searcher.suggest() results for "ra" with prefix=2 and prefix=1."""
    domain = (
        "Shoot To Kill",
        "Bloom, Split and Deviate",
        "Rankle the Seas and the Skies",
        "Lightning Flash Flame Shell",
        "Flower Wind Rage and Flower God Roar, Heavenly Wind Rage and "
        "Heavenly Demon Sneer",
        "All Waves, Rise now and Become my Shield, Lightning, Strike "
        "now and Become my Blade",
        "Cry, Raise Your Head, Rain Without end",
        "Sting All Enemies To Death",
        "Reduce All Creation to Ash",
        "Sit Upon the Frozen Heavens",
        "Call forth the Twilight",
    )

    content_field = fields.TEXT(stored=True, spelling=True)
    quick_field = fields.NGRAM(maxsize=10, stored=True)
    schema = fields.Schema(content=content_field, quick=quick_field)

    with TempIndex(schema, "sugprefix") as ix:
        with ix.writer() as w:
            for phrase in domain:
                text = u(phrase)
                w.add_document(content=text, quick=text)

        with ix.searcher() as s:
            two_char = s.suggest("content", u("ra"), maxdist=2, prefix=2)
            assert_equal(two_char, ['rage', 'rain'])

            one_char = s.suggest("content", "ra", maxdist=2, prefix=1)
            assert_equal(one_char, ["rage", "rain", "roar"])
示例#7
0
def test_buffered_update():
    """Repeated updates through a BufferedWriter keep one doc per unique id.

    Ids "a", "b" and "c" are each updated ten times; afterwards the writer's
    reader must expose exactly three documents holding the last payloads
    ("9a", "9b", "9c").
    """
    schema = fields.Schema(id=fields.ID(stored=True, unique=True),
                           payload=fields.STORED)
    with TempIndex(schema, "bufferedupdate") as ix:
        w = writing.BufferedWriter(ix, period=None, limit=5)
        try:
            for i in xrange(10):
                for char in u("abc"):
                    w.update_document(id=char, payload=text_type(i) + char)

            with w.reader() as r:
                stored = sorted(r.all_stored_fields(), key=lambda x: x["id"])
                assert_equal(stored, [
                    {'id': u('a'), 'payload': u('9a')},
                    {'id': u('b'), 'payload': u('9b')},
                    {'id': u('c'), 'payload': u('9c')},
                ])
                assert_equal(r.doc_count(), 3)
        finally:
            # Close the buffered writer even when an assertion fails, so it
            # is never left open while TempIndex tears the index down.
            w.close()
示例#8
0
def test_sort_filter():
    """Sorted, filtered searches return the expected documents in order.

    100 key/group documents are indexed in shuffled order, committing every
    26 documents with ``merge=False``. A search sorted by (key, group) and
    filtered to group "bravo" must match the pre-sorted source list, both
    with ``limit=20`` and ``limit=None``, before and after ``ix.optimize()``.
    """
    schema = fields.Schema(group=fields.ID(stored=True),
                           key=fields.ID(stored=True))
    groups = u("alfa bravo charlie").split()
    keys = u("abcdefghijklmnopqrstuvwxyz")
    source = [{"key": keys[i % len(keys)], "group": groups[i % len(groups)]}
              for i in xrange(100)]
    source.sort(key=lambda x: (x["key"], x["group"]))
    expected = [d for d in source if d["group"] == "bravo"]

    sample = list(source)
    random.shuffle(sample)

    fq = query.Term("group", u("bravo"))

    def check_filtered(searcher):
        # Run the limited and the unlimited variant of the same search.
        for limit in (20, None):
            r = searcher.search(query.Every(),
                                sortedby=("key", "group"),
                                filter=fq,
                                limit=limit)
            wanted = expected if limit is None else expected[:limit]
            assert_equal([h.fields() for h in r], wanted)

    with TempIndex(schema, "sortfilter") as ix:
        w = ix.writer()
        # Count from 1 so the commit happens after every 26th document.
        for count, fs in enumerate(sample, 1):
            w.add_document(**fs)
            if not count % 26:
                w.commit(merge=False)
                w = ix.writer()
        w.commit()

        with ix.searcher() as s:
            check_filtered(s)

        ix.optimize()

        # Same expectations once optimize() has run.
        with ix.searcher() as s:
            check_filtered(s)
示例#9
0
def test_deleteall():
    """Deleting every document number leaves an empty index after optimize."""
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "deleteall") as ix:
        domain = u("alfa bravo charlie delta echo").split()

        w = ix.writer()
        for i, words in enumerate(permutations(domain)):
            w.add_document(text=u(" ").join(words))
            # Commit and reopen on every tenth iteration (including i == 0).
            if i % 10 == 0:
                w.commit()
                w = ix.writer()
        w.commit()

        # This is just a test, don't use this method to delete all docs IRL!
        total = ix.doc_count_all()
        w = ix.writer()
        for docnum in xrange(total):
            w.delete_document(docnum)
        w.commit()

        with ix.searcher() as s:
            either = query.Or([query.Term("text", u("alfa")),
                               query.Term("text", u("bravo"))])
            hits = s.search(either)
            assert_equal(len(hits), 0)

        ix.optimize()
        assert_equal(ix.doc_count_all(), 0)

        with ix.reader() as r:
            assert_equal(list(r), [])
示例#10
0
def test_buffered_threads():
    """Ten threads update random names through one shared BufferedWriter.

    After all threads finish and the writer is closed, the index must hold
    exactly the four distinct names from the domain.
    """
    class SimWriter(threading.Thread):
        """Thread issuing ten update_document calls with short sleeps."""

        def __init__(self, w, domain):
            threading.Thread.__init__(self)
            self.w = w
            self.domain = domain

        def run(self):
            for _ in xrange(10):
                picked = random.choice(self.domain)
                self.w.update_document(name=picked)
                time.sleep(random.uniform(0.01, 0.1))

    schema = fields.Schema(name=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "buffthreads") as ix:
        domain = u("alfa bravo charlie delta").split()
        w = writing.BufferedWriter(ix, limit=10)

        workers = [SimWriter(w, domain) for _ in xrange(10)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        w.close()

        with ix.reader() as r:
            assert_equal(r.doc_count(), 4)
            names = sorted([d["name"] for d in r.all_stored_fields()])
            assert_equal(names, domain)
示例#11
0
def test_asyncwriter_no_stored():
    """Twenty AsyncWriter commits on a schema with no stored fields all land."""
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "asyncnostored") as ix:
        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
                  u("foxtrot"), u("golf"), u("hotel"), u("india"))

        # Simulate doing 20 (near-)simultaneous commits. If we weren't using
        # AsyncWriter, at least some of these would fail because the first
        # writer wouldn't be finished yet.
        writers = []
        for num in xrange(20):
            w = writing.AsyncWriter(ix)
            writers.append(w)
            words = random.sample(domain, 5)
            w.add_document(id=text_type(num), text=u(" ").join(words))
            w.commit()

        # Wait for all writers to finish before checking the results
        for pending in writers:
            if pending.running:
                pending.join()

        # Check whether all documents made it into the index.
        with ix.reader() as r:
            found = sorted([int(term) for term in r.lexicon("id")])
            assert_equal(found, list(range(20)))
示例#12
0
def test_vector_merge():
    """Frequency vectors remain readable for docs added in separate commits."""
    schema = fields.Schema(title=fields.TEXT,
                           content=fields.TEXT(vector=formats.Frequency()))

    docs = [
        (u("one"), u("This is the story of the black hole story")),
        (u("two"), u("You can read along in your book")),
    ]
    expected = [
        (u("one"), [(u('black'), 1), (u('hole'), 1), (u('story'), 2)]),
        (u("two"), [(u('along'), 1), (u('book'), 1), (u('read'), 1)]),
    ]

    with TempIndex(schema, "vectormerge") as ix:
        # One writer and one commit per document.
        for title, content in docs:
            writer = ix.writer()
            writer.add_document(title=title, content=content)
            writer.commit()

        with ix.searcher() as s:
            r = s.reader()
            for title, wanted in expected:
                docnum = s.document_number(title=title)
                vec = list(r.vector_as("frequency", docnum, "content"))
                assert_equal(vec, wanted)
示例#13
0
def test_todisk():
    """Copy the index built by make_index() into a TempIndex via add_reader.

    There are no assertions; the test passes when the copy and commit
    complete without raising.
    """
    ix = make_index()

    with TempIndex(ix.schema, "ramtodisk") as fix:
        w = fix.writer()
        # Use the source reader as a context manager so it is closed once
        # the copy has been committed, instead of leaking it.
        with ix.reader() as source_reader:
            w.add_reader(source_reader)
            w.commit()
示例#14
0
def test_page_sorted():
    """Check sorted searches and search_page() over 26 single-letter docs."""
    schema = fields.Schema(key=fields.ID(stored=True))
    with TempIndex(schema, "pagesorted") as ix:
        letters = list(u("abcdefghijklmnopqrstuvwxyz"))
        random.shuffle(letters)

        w = ix.writer()
        for letter in letters:
            w.add_document(key=letter)
        w.commit()

        with ix.searcher() as s:
            # Limited search: five scored hits, full length still reported.
            results = s.search(query.Every(), sortedby="key", limit=5)
            assert_equal(results.scored_length(), 5)
            assert_equal(len(results), s.doc_count_all())

            # First page of five, sorted by key.
            page = s.search_page(query.Every(), 1, pagelen=5, sortedby="key")
            first_keys = "".join([hit["key"] for hit in page])
            assert_equal(first_keys, "abcde")
            assert_equal(page[10:], [])

            # A term that matches nothing gives an empty last page.
            no_match = query.Term("key", "glonk")
            page = s.search_page(no_match, 1, pagelen=5, sortedby="key")
            assert_equal(len(page), 0)
            assert page.is_last_page()
示例#15
0
def test_multifacet():
    """Grouping by a MultiFacet of tag and size yields (tag, size) buckets."""
    schema = fields.Schema(tag=fields.ID(stored=True),
                           size=fields.ID(stored=True))
    # Position in this list is the document number used in `correct` below.
    rows = [
        ("alfa", "small"),
        ("bravo", "medium"),
        ("alfa", "large"),
        ("bravo", "small"),
        ("alfa", "medium"),
        ("bravo", "medium"),
    ]

    with TempIndex(schema, "multifacet") as ix:
        w = ix.writer()
        for tag, size in rows:
            w.add_document(tag=u(tag), size=u(size))
        w.commit()

        correct = {
            (u('bravo'), u('medium')): [1, 5],
            (u('alfa'), u('large')): [2],
            (u('alfa'), u('medium')): [4],
            (u('alfa'), u('small')): [0],
            (u('bravo'), u('small')): [3],
        }

        with ix.searcher() as s:
            tag_size = sorting.MultiFacet(["tag", "size"])
            results = s.search(query.Every(), groupedby={"tag/size": tag_size})
            assert_equal(results.groups("tag/size"), correct)
示例#16
0
def test_no_add():
    """ix.writer(procs=3) returns an MpWriter; no documents are added."""
    from whoosh.filedb.multiproc import MpWriter

    text_field = fields.TEXT(stored=True, spelling=True, vector=True)
    schema = fields.Schema(text=text_field)
    with TempIndex(schema) as ix:
        with ix.writer(procs=3) as writer:
            writer_type = type(writer)
            assert_equal(writer_type, MpWriter)
示例#17
0
def test_batchsize_eq_doccount():
    """Add exactly ``batchsize`` (10) documents through a 4-process writer.

    No assertions: the test passes when the writer context exits cleanly.
    """
    from whoosh.filedb.multiproc import MpWriter

    keyword_field = fields.KEYWORD(stored=True)
    schema = fields.Schema(a=keyword_field)
    with TempIndex(schema) as ix:
        with ix.writer(procs=4, batchsize=10) as writer:
            for num in xrange(10):
                value = u(str(num))
                writer.add_document(a=value)
示例#18
0
def test_delete_recovery():
    """Run a RecoverWriter and a RecoverReader side by side on one index.

    Both are started and then joined; the test has no assertions of its own.
    """
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "delrecover") as ix:
        # Writer first, then reader -- same order for start and join.
        workers = (RecoverWriter(ix), RecoverReader(ix))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
示例#19
0
 def check(method):
     """Populate a fresh index via ``method(ix)`` and verify the "tag" groups.

     ``method`` is called with the temporary index; afterwards an Every()
     search grouped by "tag" must map each tag to the listed doc numbers.
     """
     expected = [
         (u('one'), [0, 6]),
         (u('three'), [1, 3, 7, 8]),
         (u('two'), [2, 4, 5]),
     ]
     with TempIndex(get_schema()) as ix:
         method(ix)
         with ix.searcher() as s:
             groups = s.search(query.Every(), groupedby="tag").groups("tag")
             assert sorted(groups.items()) == expected
示例#20
0
def test_empty_commit():
    """Committing a writer with no changes after a normal commit must not fail."""
    schema = fields.Schema(id=fields.ID(stored=True))
    with TempIndex(schema, "emptycommit") as ix:
        writer = ix.writer()
        for doc_id in ("1", "2", "3"):
            writer.add_document(id=u(doc_id))
        writer.commit()

        # Open a second writer and commit it immediately, with nothing added.
        ix.writer().commit()
示例#21
0
def test_cancel_delete():
    """Deletions made on a cancelled writer must not show up in the index.

    Run once on an index built with a single commit and once on an index
    built from four ``merge=False`` commits.
    """
    schema = fields.Schema(id=fields.ID(stored=True))

    def assert_untouched(ix, docnums=()):
        # The reader must report no deletions at all, nor for these docs.
        with ix.reader() as r:
            assert not r.has_deletions()
            for docnum in docnums:
                assert not r.is_deleted(docnum)

    def delete_then_cancel(ix):
        writer = ix.writer()
        writer.delete_document(2)
        writer.delete_document(3)
        writer.cancel()

    # Single segment
    with TempIndex(schema, "canceldelete1") as ix:
        w = ix.writer()
        for char in u("ABCD"):
            w.add_document(id=char)
        w.commit()

        assert_untouched(ix)
        delete_then_cancel(ix)
        assert_untouched(ix, (2, 3))

    # Multiple segments
    with TempIndex(schema, "canceldelete2") as ix:
        for char in u("ABCD"):
            w = ix.writer()
            w.add_document(id=char)
            w.commit(merge=False)

        assert_untouched(ix)
        delete_then_cancel(ix)
        assert_untouched(ix, (2, 3))
示例#22
0
def test_empty_field():
    """Sorting by a field no document populated returns docs in added order."""
    schema = fields.Schema(id=fields.STORED, key=fields.KEYWORD)
    with TempIndex(schema, "emptysort") as ix:
        writer = ix.writer()
        for num in (1, 2, 3):
            writer.add_document(id=num)
        writer.commit()

        with ix.searcher() as s:
            hits = s.search(query.Every(), sortedby="key")
            ids = [hit["id"] for hit in hits]
            assert_equal(ids, [1, 2, 3])
示例#23
0
def test_single():
    """A one-document index is reachable by term, by field and in full."""
    schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(schema, "single") as ix:
        writer = ix.writer()
        writer.add_document(id=u("1"), text=u("alfa"))
        writer.commit()

        with ix.searcher() as s:
            only_doc = [{"id": "1"}]
            assert ("text", u("alfa")) in s.reader()
            by_id = list(s.documents(id="1"))
            assert_equal(by_id, only_doc)
            by_text = list(s.documents(text="alfa"))
            assert_equal(by_text, only_doc)
            everything = list(s.all_stored_fields())
            assert_equal(everything, only_doc)
def test_many_updates():
    """10000 single-document updates on a unique key leave no duplicates.

    Each iteration opens a writer, updates one random key in 0..5000 and
    commits. Afterwards every stored key must appear only once.
    """
    schema = fields.Schema(key=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "manyupdates") as ix:
        for _ in xrange(10000):
            num = random.randint(0, 5000)
            w = ix.writer()
            w.update_document(key=text_type(num))
            w.commit()

        with ix.searcher() as s:
            # limit=None: fetch every hit. With the default limit only the
            # first page of results would be checked for duplicates.
            hits = s.search(query.Every(), limit=None)
            result = [d["key"] for d in hits]
            assert_equal(len(result), len(set(result)))
示例#25
0
def test_vector_reading():
    """Read back the frequency vector of the only document's content field."""
    content_field = fields.TEXT(vector=formats.Frequency())
    schema = fields.Schema(title=fields.TEXT, content=content_field)

    with TempIndex(schema, "vectorreading") as ix:
        w = ix.writer()
        w.add_document(title=u("one"),
                       content=u("This is the story of the black hole story"))
        w.commit()

        expected = [(u('black'), 1), (u('hole'), 1), (u('story'), 2)]
        with ix.reader() as r:
            vector = list(r.vector_as("frequency", 0, "content"))
            assert_equal(vector, expected)
示例#26
0
def test_cached_lexicon():
    """lexicon("tag") is still sorted after fieldcache("tag") was requested."""
    schema = fields.Schema(tag=fields.ID)
    with TempIndex(schema, "cachedlexicon") as ix:
        writer = ix.writer()
        for tag in ("sierra", "alfa", "juliet", "romeo"):
            writer.add_document(tag=u(tag))
        writer.commit()

        with ix.reader() as r:
            # Request the field cache first; its return value is not used.
            r.fieldcache("tag")
            terms = list(r.lexicon("tag"))
            assert_equal(terms, ["alfa", "juliet", "romeo", "sierra"])
示例#27
0
def test_prefix_address():
    """Prefix suggestions for "aa" stay within the field that was asked for."""
    fieldtype = fields.TEXT(spelling=True)
    schema = fields.Schema(f1=fieldtype, f2=fieldtype)

    expected = [
        ("f1", ["aabc", "aade", "aaqr", "aawx"]),
        ("f2", ["aa12", "aa34", "aa56", "aa78"]),
    ]

    with TempIndex(schema, "prefixaddr") as ix:
        with ix.writer() as w:
            w.add_document(f1=u("aabc aawx aaqr aade"),
                           f2=u("aa12 aa34 aa56 aa78"))

        with ix.searcher() as s:
            for fieldname, wanted in expected:
                sugs = s.suggest(fieldname, u("aa"), maxdist=2, prefix=2)
                assert_equal(sorted(sugs), wanted)
示例#28
0
def test_deletion():
    """Check delete_by_term counts and document counts across commits.

    The same three documents (keys A, B, C) are added twice; key B is deleted
    after each round. After optimize() four documents remain and the "name"
    lexicon no longer contains the terms of the B documents.
    """
    schema = fields.Schema(key=fields.ID, name=fields.TEXT, value=fields.TEXT)

    def add_three(writer):
        writer.add_document(key=u("A"),
                            name=u("Yellow brown"),
                            value=u("Blue red green purple?"))
        writer.add_document(key=u("B"),
                            name=u("Alpha beta"),
                            value=u("Gamma delta epsilon omega."))
        writer.add_document(key=u("C"),
                            name=u("One two"),
                            value=u("Three four five."))

    with TempIndex(schema, "deletion") as ix:
        w = ix.writer()
        add_three(w)
        w.commit()

        w = ix.writer()
        deleted = w.delete_by_term("key", u("B"))
        assert_equal(deleted, 1)
        w.commit(merge=False)

        # The deleted document still counts towards doc_count_all().
        assert_equal(ix.doc_count_all(), 3)
        assert_equal(ix.doc_count(), 2)

        w = ix.writer()
        add_three(w)
        w.commit()

        # This will match both documents with key == B, one of which is already
        # deleted. This should not raise an error.
        w = ix.writer()
        deleted = w.delete_by_term("key", u("B"))
        assert_equal(deleted, 1)
        w.commit()

        ix.optimize()
        assert_equal(ix.doc_count_all(), 4)
        assert_equal(ix.doc_count(), 4)

        with ix.reader() as tr:
            names = list(tr.lexicon("name"))
            assert_equal(names, ["brown", "one", "two", "yellow"])
示例#29
0
def test_add_field():
    """Fields added through a writer (plain and glob) are usable immediately.

    Field "b" and glob field "c*" are added on the second writer; the
    document written with them must come back with "b" and "cat" stored.
    """
    schema = fields.Schema(a=fields.TEXT)
    with TempIndex(schema, "addfield") as ix:
        with ix.writer() as first:
            first.add_document(a=u("alfa bravo charlie"))

        with ix.writer() as second:
            second.add_field("b", fields.ID(stored=True))
            second.add_field("c*", fields.ID(stored=True), glob=True)
            new_doc = dict(a=u("delta echo foxtrot"),
                           b=u("india"),
                           cat=u("juliet"))
            second.add_document(**new_doc)

        with ix.searcher() as s:
            stored = s.document(b=u("india"))
            assert_equal(stored, {"b": "india", "cat": "juliet"})
示例#30
0
def test_no_stored():
    """Twenty documents index correctly on a schema with no stored fields."""
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "nostored") as ix:
        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
                  u("foxtrot"), u("golf"), u("hotel"), u("india"))

        writer = ix.writer()
        for num in xrange(20):
            words = random.sample(domain, 5)
            writer.add_document(id=text_type(num), text=u(" ").join(words))
        writer.commit()

        with ix.reader() as r:
            # Every id 0..19 must be present as a term in the "id" field.
            found = sorted([int(term) for term in r.lexicon("id")])
            assert_equal(found, list(range(20)))