示例#1
0
def test_buffered_update():
    """Check that updates through a BufferedWriter replace earlier documents.

    Thirty ``update_document`` calls are issued against three unique ids
    ("a", "b", "c") with ``limit=5``. The writer's own reader must then show
    exactly three documents, each holding the payload from the final pass
    (``i == 9``).
    """
    schema = fields.Schema(id=fields.ID(stored=True, unique=True),
                           payload=fields.STORED)
    with TempIndex(schema, "bufferedupdate") as ix:
        w = writing.BufferedWriter(ix, period=None, limit=5)
        # Close the writer even if an assertion below fails; otherwise the
        # BufferedWriter is left open while TempIndex tears the index down.
        try:
            for i in xrange(10):
                for char in u("abc"):
                    w.update_document(id=char, payload=text_type(i) + char)

            with w.reader() as r:
                stored = sorted(r.all_stored_fields(), key=lambda x: x["id"])
                assert_equal(stored,
                             [{
                                 'id': u('a'),
                                 'payload': u('9a')
                             }, {
                                 'id': u('b'),
                                 'payload': u('9b')
                             }, {
                                 'id': u('c'),
                                 'payload': u('9c')
                             }])
                assert_equal(r.doc_count(), 3)
        finally:
            w.close()
示例#2
0
def test_buffered_threads():
    """Exercise one BufferedWriter from ten concurrent threads.

    Each thread performs ten ``update_document`` calls with a name picked at
    random from a four-word list, sleeping briefly between calls. After the
    writer is closed, the index must hold exactly the four distinct names.
    """
    class SimWriter(threading.Thread):
        """Worker thread that updates randomly chosen names via a writer."""

        def __init__(self, w, domain):
            super(SimWriter, self).__init__()
            self.w = w
            self.domain = domain

        def run(self):
            for _ in xrange(10):
                self.w.update_document(name=random.choice(self.domain))
                time.sleep(random.uniform(0.01, 0.1))

    schema = fields.Schema(name=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "buffthreads") as ix:
        names = u("alfa bravo charlie delta").split()
        writer = writing.BufferedWriter(ix, limit=10)

        # Build every worker first, then start them all, then wait for all.
        workers = []
        for _ in xrange(10):
            workers.append(SimWriter(writer, names))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        writer.close()

        with ix.reader() as r:
            assert_equal(r.doc_count(), 4)
            stored_names = [d["name"] for d in r.all_stored_fields()]
            stored_names.sort()
            assert_equal(stored_names, names)
示例#3
0
def test_buffered_update():
    """Check that updates through a BufferedWriter replace earlier documents.

    Thirty ``update_document`` calls are issued against three unique ids
    ("a", "b", "c") with ``limit=5``. Iterating the writer's own reader must
    then yield exactly three stored-field dicts, each holding the payload
    from the final pass (``i == 9``).
    """
    schema = fields.Schema(id=fields.ID(stored=True, unique=True),
                           payload=fields.STORED)
    with TempIndex(schema, "bufferedupdate") as ix:
        w = writing.BufferedWriter(ix, period=None, limit=5)
        # Close the writer even if an assertion below fails; otherwise the
        # BufferedWriter is left open while TempIndex tears the index down.
        try:
            for i in xrange(10):
                for char in u"abc":
                    w.update_document(id=char, payload=text_type(i) + char)

            with w.reader() as r:
                # iter_docs() yields (docnum, stored_fields) pairs; only the
                # stored fields matter here.
                sfs = sorted((sf for _, sf in r.iter_docs()),
                             key=lambda x: x["id"])
                assert sfs == [{
                    'id': u('a'),
                    'payload': u('9a')
                }, {
                    'id': u('b'),
                    'payload': u('9b')
                }, {
                    'id': u('c'),
                    'payload': u('9c')
                }]
                assert r.doc_count() == 3
        finally:
            w.close()
示例#4
0
def test_buffered():
    """Add 20 documents through a BufferedWriter and count the segments.

    The writer is created with ``limit=10`` and ``commitargs={"merge": False}``;
    after closing it the index is expected to contain two segments.
    """
    words = u"alfa bravo charlie delta echo foxtrot golf hotel india".split()
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "buffered") as ix:
        writer = writing.BufferedWriter(ix, period=None, limit=10,
                                        commitargs={"merge": False})
        for n in xrange(20):
            picked = random.sample(words, 5)
            writer.add_document(id=text_type(n), text=u" ".join(picked))
        time.sleep(0.1)
        writer.close()

        segment_count = len(ix._segments())
        assert segment_count == 2
示例#5
0
def test_buffered():
    """Add 100 documents through a BufferedWriter and count the segments.

    The writer is created with ``limit=10`` and ``commitargs={"merge": False}``;
    after closing it the index is expected to contain ten segments.
    """
    words = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
             u("foxtrot"), u("golf"), u("hotel"), u("india"))
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "buffered") as ix:
        writer = writing.BufferedWriter(ix, period=None, limit=10,
                                        commitargs={"merge": False})
        for n in xrange(100):
            picked = random.sample(words, 5)
            writer.add_document(id=text_type(n), text=u(" ").join(picked))
        time.sleep(0.5)
        writer.close()

        segment_count = len(ix._segments())
        assert_equal(segment_count, 10)
示例#6
0
    def fill_index(self, df):
        """Add every row of ``df`` to ``self.ix`` through a buffered writer.

        Each row is converted to a dict and passed as keyword arguments to
        ``add_document``. A row that fails to index is reported on stdout and
        counted instead of aborting the run. A summary is printed at the end
        and ``self.load_to_pandas()`` is then called.

        Args:
            df: object providing ``iterrows()`` and ``len()`` (presumably a
                pandas DataFrame). The progress message assumes integer index
                labels -- TODO confirm against callers.
        """
        failed = 0
        writer = writing.BufferedWriter(self.ix, period=20, limit=1000)
        # Close explicitly rather than relying on the context-manager exit.
        # NOTE(review): Whoosh documents that BufferedWriter.close() must be
        # called to release the write lock; the inherited exit appears to only
        # commit -- confirm against the installed Whoosh version.
        try:
            for index, row in df.iterrows():
                row_dict = row.to_dict()
                try:
                    writer.add_document(**row_dict)
                except Exception:
                    # Only Exception is caught so Ctrl-C / SystemExit still
                    # propagate. The body may be missing or unsized (e.g.
                    # NaN), so the report must not raise itself.
                    body = row_dict.get('body')
                    body_len = len(body) if hasattr(body, '__len__') else None
                    print("Couldn't index document in Whoosh", index,
                          body_len, body)
                    failed += 1
                if index % 10000 == 0:
                    print("Went through {} document(s)".format(index + 1))
        finally:
            writer.close()

        print(
            '{} documents could not be indexed out of {}. Not an issue if small %.'
            .format(failed, len(df)))
        self.load_to_pandas()
示例#7
0
def test_buffered_search():
    """Check that a BufferedWriter's searcher sees documents added so far.

    Four documents are added and a term search for "tango" must return ids
    2 and 3. Four more are added and the same search must then return ids
    2, 3, 6 and 7.
    """
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    with TempIndex(schema, "bufferedsearch") as ix:
        w = writing.BufferedWriter(ix, period=None, limit=5)
        # Close the writer even if an assertion below fails; otherwise the
        # BufferedWriter is left open while TempIndex tears the index down.
        try:
            w.add_document(id=1, text=u("alfa bravo charlie"))
            w.add_document(id=2, text=u("bravo tango delta"))
            w.add_document(id=3, text=u("tango delta echo"))
            w.add_document(id=4, text=u("charlie delta echo"))

            with w.searcher() as s:
                r = s.search(query.Term("text", u("tango")))
                assert_equal(sorted([d["id"] for d in r]), [2, 3])

            w.add_document(id=5, text=u("foxtrot golf hotel"))
            w.add_document(id=6, text=u("india tango juliet"))
            w.add_document(id=7, text=u("tango kilo lima"))
            w.add_document(id=8, text=u("mike november echo"))

            with w.searcher() as s:
                r = s.search(query.Term("text", u("tango")))
                assert_equal(sorted([d["id"] for d in r]), [2, 3, 6, 7])
        finally:
            w.close()
示例#8
0
def test_buffered_threads():
    """Exercise one BufferedWriter from five concurrent threads.

    Each thread performs five ``update_document`` calls with a name picked at
    random from a four-word list, sleeping briefly between calls. After the
    writer is closed, the index must hold exactly the four distinct names.
    The final assertion relies on the 25 random picks covering all four names.
    """
    schema = fields.Schema(name=fields.ID(unique=True, stored=True))
    words = u"alfa bravo charlie delta".split()
    with TempIndex(schema, "buffthreads") as ix:
        writer = writing.BufferedWriter(ix, limit=10)

        def simulate_writes():
            # Body of each worker thread: five random updates with pauses.
            for _ in xrange(5):
                writer.update_document(name=random.choice(words))
                time.sleep(random.uniform(0.01, 0.1))

        workers = []
        for _ in xrange(5):
            workers.append(threading.Thread(target=simulate_writes))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        writer.close()

        with ix.reader() as r:
            assert r.doc_count() == 4
            stored_names = [d["name"] for d in r.all_stored_fields()]
            stored_names.sort()
            assert stored_names == words