def test_ordered_closest():
    """Round-trip an ordered hash file and check closest_key() probes."""
    names = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
             "golf", "hotel", "india", "juliet", "kilo", "lima",
             "mike", "november"]
    # Make into bytes for Python 3
    keys = [b(name) for name in names]
    values = [str(len(key)).encode("ascii") for key in keys]

    with TempStorage("orderedclosest") as st:
        writer = OrderedHashWriter(st.create_file("test.hsh"))
        writer.add_all(zip(keys, values))
        writer.close()

        reader = OrderedHashReader.open(st, "test.hsh")
        ck = reader.closest_key
        # Probes before, at, between, and past the stored keys
        assert ck(b("")) == b("alfa")
        assert ck(b(" ")) == b("alfa")
        assert ck(b("alfa")) == b("alfa")
        assert ck(b("bravot")) == b("charlie")
        assert ck(b("charlie")) == b("charlie")
        assert ck(b("kiloton")) == b("lima")
        assert ck(b("oskar")) is None

        assert list(reader.keys()) == keys
        assert list(reader.values()) == values
        assert list(reader.keys_from(b("f"))) == keys[5:]
        reader.close()
def test_closed_searcher():
    """Using a closed searcher raises ReaderClosed; a fresh one works."""
    from whoosh.reading import ReaderClosed

    schema = fields.Schema(key=fields.KEYWORD(stored=True, sortable=True))
    with TempStorage() as st:
        ix = st.create_index(schema)
        with ix.writer() as w:
            for word in (u"alfa", u"bravo", u"charlie", u"delta", u"echo"):
                w.add_document(key=word)

        s = ix.searcher()
        r = s.search(query.TermRange("key", "b", "d"))
        s.close()
        assert s.is_closed

        # Every access path through the closed searcher should fail
        with pytest.raises(ReaderClosed):
            assert r[0]["key"] == "bravo"
        with pytest.raises(ReaderClosed):
            s.reader().column_reader("key")
        with pytest.raises(ReaderClosed):
            s.suggest("key", "brovo")

        # A brand-new searcher over the same index behaves normally
        s = ix.searcher()
        r = s.search(query.TermRange("key", "b", "d"))
        assert r[0]
        assert r[0]["key"] == "bravo"
        c = s.reader().column_reader("key")
        assert c[1] == "bravo"
        assert s.suggest("key", "brovo") == ["bravo"]
def test_huge_postfile():
    """Write and read a posting block past the 4 GB file-offset boundary.

    Exercises 64-bit offset handling in the posting file reader/writer.
    """
    with TempStorage("hugeindex") as st:
        pf = st.create_file("test.pst")
        gb5 = 5 * 1024 * 1024 * 1024
        pf.seek(gb5)
        # BUG FIX: write bytes, not str -- a str literal fails on a binary
        # file under Python 3; use b() like the rest of this file
        pf.write(b("\x00\x00\x00\x00"))
        # assert_equal replaced with plain asserts, consistent with the
        # other tests in this file
        assert pf.tell() == gb5 + 4

        fpw = FilePostingWriter(pf)
        # Renamed from "format" to avoid shadowing the builtin
        fmt = formats.Frequency(None)
        offset = fpw.start(fmt)
        for i in xrange(10):
            fpw.write(i, float(i), struct.pack("!I", i), 10)
        posttotal = fpw.finish()
        assert posttotal == 10
        fpw.close()

        pf = st.open_file("test.pst")
        pfr = FilePostingReader(pf, offset, fmt)
        i = 0
        while pfr.is_active():
            assert pfr.id() == i
            assert pfr.weight() == float(i)
            assert pfr.value() == struct.pack("!I", i)
            pfr.next()
            i += 1
        pf.close()
def test_bigtable():
    """Store 100,000 random key/value pairs in a hash file and verify
    both random-order lookup and full iteration."""
    with TempStorage("bigtable") as st:
        # Renamed parameters: the originals ("min", "max") shadowed the
        # builtins of the same names
        def randstring(minlen, maxlen):
            return "".join(chr(randint(1, 255))
                           for _ in xrange(randint(minlen, maxlen)))

        count = 100000
        samp = dict((randstring(1, 50), randstring(1, 50))
                    for _ in xrange(count))

        fhw = HashWriter(st.create_file("big.hsh"))
        fhw.add_all(iteritems(samp))
        fhw.close()

        fhr = HashReader(st.open_file("big.hsh"))
        keys = list(samp.keys())
        shuffle(keys)
        # assert_equal replaced with plain asserts for consistency with
        # the rest of this file
        for key in keys:
            assert samp[key] == fhr[key]

        set1 = set(iteritems(samp))
        set2 = set(fhr.items())
        assert set1 == set2
        fhr.close()
def test_random_multistream():
    """Interleave random-sized writes to many compound sub-files, then
    verify each sub-file reads back its full contents."""
    letters = "abcdefghijklmnopqrstuvwxyz"

    def randstring(n):
        text = "".join(random.choice(letters) for _ in xrange(n))
        return text.encode("latin1")

    domain = {}
    for _ in xrange(100):
        domain[randstring(random.randint(5, 10))] = randstring(2500)

    outfiles = dict((name, BytesIO(value))
                    for name, value in domain.items())

    with TempStorage() as st:
        msw = compound.CompoundWriter(st, buffersize=1024)
        mfiles = dict((name, msw.create_file(name)) for name in domain)

        # Copy every source stream into its sub-file in randomly
        # interleaved 1000-byte chunks until all sources are drained
        while outfiles:
            name = random.choice(list(outfiles.keys()))
            chunk = outfiles[name].read(1000)
            mfiles[name].write(chunk)
            if len(chunk) < 1000:
                del outfiles[name]

        f = st.create_file("test")
        msw.save_as_compound(f)

        f = st.open_file("test")
        msr = compound.CompoundStorage(f)
        for name, value in domain.items():
            assert msr.open_file(name).read() == value
        msr.close()
def test_threaded_filelock():
    """A file lock held by the main thread blocks a second thread until
    the main thread releases it."""
    with TempStorage("threadedfilelock") as st:
        main_lock = st.lock("testlock")
        result = []

        # The worker tries to acquire the lock, records success, and quits
        def worker():
            other_lock = st.lock("testlock")
            if try_for(other_lock.acquire, 1.0, 0.1):
                result.append(True)
                other_lock.release()

        t = threading.Thread(target=worker)

        main_lock.acquire()   # hold the lock in this thread
        t.start()             # start the worker contending for it
        time.sleep(0.15)      # give the worker time to block
        main_lock.release()   # let the worker acquire it
        t.join()              # wait for the worker to finish

        # If the worker got the lock, it appended True to "result"
        assert result == [True]
def test_random_hash():
    """1000 random key/value pairs survive a hash-file round trip."""
    from string import ascii_letters as domain

    times = 1000
    shortest = 1
    longest = len(domain)

    def randkey():
        size = random.randint(shortest, longest)
        return b("".join(random.sample(domain, size)))

    with TempStorage("randomhash") as st:
        samp = dict((randkey(), randkey()) for _ in xrange(times))

        hw = HashWriter(st.create_file("test.hsh"))
        for key, value in iteritems(samp):
            hw.add(key, value)
        hw.close()

        probe_order = list(samp.keys())
        random.shuffle(probe_order)

        hr = HashReader.open(st, "test.hsh")
        for key in probe_order:
            assert hr[key] == samp[key]
        hr.close()
def test_hash_contents():
    """keys()/values()/items() of a hash file match exactly what was
    written, and every individual lookup succeeds."""
    pairs = [("alfa", "bravo"), ("charlie", "delta"), ("echo", "foxtrot"),
             ("golf", "hotel"), ("india", "juliet"), ("kilo", "lima"),
             ("mike", "november"), ("oskar", "papa"), ("quebec", "romeo"),
             ("sierra", "tango"), ("ultra", "victor"), ("whiskey", "xray")]
    # Convert to bytes
    samp = set((b(k), b(v)) for k, v in pairs)

    with TempStorage("hashcontents") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all(samp)
        hw.close()

        hr = HashReader.open(st, "test.hsh")

        # Probe each pair individually, in random order
        probes = list(samp)
        random.shuffle(probes)
        for key, value in probes:
            assert hr[key] == value

        assert set(hr.keys()) == set(k for k, _ in samp)
        assert set(hr.values()) == set(v for _, v in samp)
        assert set(hr.items()) == samp
        hr.close()
def test_words():
    """Flattening a graph cursor yields the inserted words in order."""
    words = enlist("alfa alpaca amtrak bellow fellow fiona zebulon")
    with TempStorage() as st:
        gwrite(words, st)
        reader = greader(st)
        cursor = fst.Cursor(reader)
        assert list(cursor.flatten_strings()) == words
        reader.close()
def test_within():
    """fst.within() finds every stored key within edit distance 1 of "01"."""
    with TempStorage() as st:
        gwrite(enlist("0 00 000 001 01 010 011 1 10 100 101 11 110 111"),
               st)
        reader = greader(st)
        found = set(fst.within(reader, "01", k=1))
        reader.close()
        expected = set(["0", "00", "01", "011", "010",
                        "001", "10", "101", "1", "11"])
        assert found == expected
def test_stored_fields():
    """Stored fields round-trip per document, including a document with
    no fields and out-of-order access."""
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        dw.start_doc(0)
        dw.add_field("a", fieldobj, "hello", 1)
        dw.add_field("b", fieldobj, "there", 1)
        dw.finish_doc()

        dw.start_doc(1)
        dw.add_field("a", fieldobj, "one", 1)
        dw.add_field("b", fieldobj, "two", 1)
        dw.add_field("c", fieldobj, "three", 1)
        dw.finish_doc()

        # Document 2 deliberately has no stored fields
        dw.start_doc(2)
        dw.finish_doc()

        dw.start_doc(3)
        dw.add_field("a", fieldobj, "alfa", 1)
        dw.add_field("b", fieldobj, "bravo", 1)
        dw.finish_doc()

        dw.close()
        seg.set_doc_count(4)

        pdr = codec.per_document_reader(st, seg)
        assert pdr.doc_count_all() == 4
        assert pdr.stored_fields(0) == {"a": "hello", "b": "there"}
        # Note: access out of order.
        # BUG FIX: this line was "assert x, {...}" -- an assert on a
        # non-empty tuple message, which always passes without comparing.
        assert pdr.stored_fields(3) == {"a": "alfa", "b": "bravo"}
        assert pdr.stored_fields(1) == {"a": "one", "b": "two", "c": "three"}

        sfs = list(pdr.all_stored_fields())
        assert len(sfs) == 4
        assert sfs == [
            {"a": "hello", "b": "there"},
            {"a": "one", "b": "two", "c": "three"},
            {},
            {"a": "alfa", "b": "bravo"},
        ]
        pdr.close()
def test_hash():
    """Basic hash-file get() for present and absent keys."""
    with TempStorage("hash") as st:
        hwf = st.create_file("test.hsh")
        writer = HashWriter(hwf)
        writer.add(b("foo"), b("bar"))
        writer.add(b("glonk"), b("baz"))
        writer.close()

        reader = HashReader.open(st, "test.hsh")
        assert reader.get(b("foo")) == b("bar")
        # "baz" is a value, not a key, so get() should miss
        assert reader.get(b("baz")) is None
        reader.close()
def test_random_access():
    """Look up 1000 sequentially-written hash entries in random order."""
    times = 1000
    with TempStorage("orderedhash") as st:
        writer = HashWriter(st.create_file("test.hsh"))
        writer.add_all((b("%08x" % n), b(str(n))) for n in xrange(times))
        writer.close()

        probe_order = list(range(times))
        random.shuffle(probe_order)

        reader = HashReader.open(st, "test.hsh")
        for n in probe_order:
            assert reader[b("%08x" % n)] == b(str(n))
        reader.close()
def test_filelock_simple():
    """Two lock objects on the same name are distinct and mutually
    exclusive; releasing one lets the other acquire."""
    with TempStorage("simplefilelock") as st:
        first = st.lock("testlock")
        second = st.lock("testlock")
        assert first is not second

        assert first.acquire()
        assert st.file_exists("testlock")
        assert not second.acquire()
        first.release()

        assert second.acquire()
        assert not first.acquire()
        second.release()
def _fst_roundtrip(domain, t):
    """Write (key, value) pairs into a graph with value type ``t``,
    read the graph back, and assert the flattened pairs match."""
    with TempStorage() as st:
        f = st.create_file("test")
        writer = fst.GraphWriter(f, vtype=t)
        writer.start_field("_")
        for key, value in domain:
            writer.insert(key, value)
        writer.finish_field()
        writer.close()

        f = st.open_file("test")
        reader = fst.GraphReader(f, vtype=t)
        cursor = fst.Cursor(reader)
        assert list(cursor.flatten_v()) == domain
        f.close()
def test_readwrite():
    """Run one writer thread against many concurrent searcher threads
    over the same storage."""
    schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    with TempStorage("threading") as st:
        domain = ("alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima",
                  "mike", "november", "oscar", "papa", "quebec", "romeo",
                  "sierra", "tango", "uniform", "victor", "whiskey",
                  "xray", "yankee", "zulu")

        class WriterThread(threading.Thread):
            def run(self):
                # BUG FIX: Storage.create_index() takes the schema as its
                # first argument; the builtin ``dir`` was being passed by
                # mistake (cf. the other create_index calls in this file)
                ix = st.create_index(schema)
                num = 0
                for i in xrange(50):
                    print(i)
                    w = ix.writer()
                    for _ in xrange(random.randint(1, 100)):
                        content = u(" ").join(
                            random.sample(domain, random.randint(5, 20)))
                        w.add_document(id=text_type(num), content=content)
                        num += 1
                    w.commit()
                    time.sleep(0.1)

        class SearcherThread(threading.Thread):
            def run(self):
                print(self.name + " starting")
                for _ in xrange(10):
                    ix = st.open_index()
                    s = ix.searcher()
                    q = query.Term("content", random.choice(domain))
                    s.search(q, limit=10)
                    s.close()
                    ix.close()
                    time.sleep(0.1)
                print(self.name + " done")

        wt = WriterThread()
        wt.start()
        time.sleep(0.5)
        for _ in xrange(20):
            SearcherThread().start()
            time.sleep(0.5)
        wt.join()
def test_version_in():
    """index.version() reports the library version and TOC format number
    for an index in storage."""
    with TempStorage("versionin") as st:
        assert not index.exists(st)

        schema = fields.Schema(text=fields.TEXT)
        ix = st.create_index(schema)
        assert index.exists(st)
        assert ix.is_empty()

        version_info = index.version(st)
        assert version_info[0] == __version__
        assert version_info[1] == index._CURRENT_TOC_VERSION

        with ix.writer() as w:
            w.add_document(text=u("alfa"))
        assert not ix.is_empty()
def test_fields():
    """Keys inserted under different field roots stay separate in the
    graph and can be read back per field."""
    with TempStorage() as st:
        f = st.create_file("test")
        gw = fst.GraphWriter(f)
        gw.start_field("f1")
        for key in ("a", "aa", "ab"):
            gw.insert(key)
        gw.finish_field()

        gw.start_field("f2")
        for key in ("ba", "baa", "bab"):
            gw.insert(key)
        # NOTE(review): f2 is never explicitly finished here, unlike f1;
        # presumably close() finalizes the open field -- TODO confirm
        gw.close()

        gr = fst.GraphReader(st.open_file("test"))
        cur1 = fst.Cursor(gr, gr.root("f1"))
        cur2 = fst.Cursor(gr, gr.root("f2"))
        assert list(cur1.flatten_strings()) == ["a", "aa", "ab"]
        assert list(cur2.flatten_strings()) == ["ba", "baa", "bab"]
        gr.close()
def _roundtrip(content, format_, astype, ana=None):
    """Index ``content`` with the given posting format, then read every
    term back and return a list of (term, value-as-astype) pairs.

    ``ana`` defaults to a StandardAnalyzer when not given.
    """
    with TempStorage("roundtrip") as st:
        codec = default_codec()
        seg = codec.new_segment(st, "")
        ana = ana or analysis.StandardAnalyzer()
        field = fields.FieldType(format=format_, analyzer=ana)

        fw = codec.field_writer(st, seg)
        fw.start_field("f1", field)
        for text, _, weight, valuestring in sorted(field.index(content)):
            fw.start_term(text)
            # All postings are written against document 0
            fw.add(0, weight, valuestring, None)
            fw.finish_term()
        fw.finish_field()
        fw.close()

        tr = codec.terms_reader(st, seg)
        postings = []
        for fieldname, btext in tr.terms():
            m = tr.matcher(fieldname, btext, format_)
            postings.append((field.from_bytes(btext), m.value_as(astype)))
        tr.close()
        return postings
def test_random():
    """100 random byte-string keys survive a graph round trip, and
    find_path() reaches each key from a reset cursor."""
    def randkey():
        length = random.randint(1, 5)
        data = array("B", (random.randint(0, 255)
                           for _ in xrange(length)))
        return array_tobytes(data)

    keys = sorted(randkey() for _ in xrange(100))
    with TempStorage() as st:
        gwrite(keys, st)
        gr = greader(st)
        cur = fst.Cursor(gr)

        # flatten() should yield the unique keys in sorted order
        expected = sorted(set(keys))
        for i, (got, want) in enumerate(zip(cur.flatten(), expected)):
            assert got == want, "%s: %r != %r" % (i, got, want)

        sample = list(keys)
        random.shuffle(sample)
        for key in sample:
            cur.reset()
            cur.find_path(key)
            assert cur.prefix_bytes() == key
        gr.close()
def test_simple_compound_mmap():
    """Run the simple compound-file checks on an mmap-capable storage."""
    with TempStorage("compound") as storage:
        # Only meaningful when the storage can memory-map files
        assert storage.supports_mmap
        _test_simple_compound(storage)