def test_extras():
    """The extras dict must survive a round trip through every combination
    of hash writer/reader, plain and ordered."""
    st = RamStorage()

    # Plain hash file
    writer = HashWriter(st.create_file("test"))
    writer.extras["test"] = 100
    writer.extras["blah"] = "foo"
    writer.close()

    reader = HashReader(st.open_file("test"), st.file_length("test"))
    assert reader.extras["test"] == 100
    assert reader.extras["blah"] == "foo"
    reader.close()

    # Ordered hash file, read back both as a plain and an ordered reader
    writer = OrderedHashWriter(st.create_file("test"))
    writer.extras["test"] = 100
    writer.extras["blah"] = "foo"
    writer.close()

    reader = HashReader(st.open_file("test"), st.file_length("test"))
    assert reader.extras["test"] == 100
    assert reader.extras["blah"] == "foo"
    reader.close()

    reader = OrderedHashReader(st.open_file("test"), st.file_length("test"))
    assert reader.extras["test"] == 100
    assert reader.extras["blah"] == "foo"
    reader.close()
def test_wordfile():
    """Build a GraphCorrector from a gzipped word list, check a suggestion,
    then round-trip the corrector through a graph file and check again.

    Returns early (silently skipping) when the word-list fixture is absent.
    """
    import os.path

    # Locate the fixture whether we run from the repo root or from tests/
    files = os.listdir(".")
    testdir = "tests"
    fname = "english-words.10.gz"
    if testdir in files:
        path = os.path.join(testdir, fname)
    elif fname in files:
        path = fname
    else:
        return
    if not os.path.exists(path):
        return

    # Context manager guarantees the gzip handle is closed even if
    # from_word_list raises while consuming the generator
    with gzip.open(path, "r") as wordfile:
        cor = spelling.GraphCorrector.from_word_list(
            word.decode("latin-1") for word in wordfile)

    assert_equal(cor.suggest("specail"), ["special"])

    st = RamStorage()
    gf = st.create_file("test.dawg")
    cor.to_file(gf)

    gf = st.open_file("test.dawg")
    cor = spelling.GraphCorrector.from_graph_file(gf)
    assert_equal(cor.suggest("specail", maxdist=1), ["special"])
    gf.close()
def test_wordfile():
    """Build a GraphCorrector from a gzipped word list, check a suggestion,
    then round-trip the corrector through a graph file and check again.

    Returns early (silently skipping) when the word-list fixture is absent.
    """
    import os.path

    # Locate the fixture whether we run from the repo root or from tests/
    files = os.listdir(".")
    testdir = "tests"
    fname = "english-words.10.gz"
    if testdir in files:
        path = os.path.join(testdir, fname)
    elif fname in files:
        path = fname
    else:
        return
    if not os.path.exists(path):
        return

    # Context manager guarantees the gzip handle is closed even if
    # from_word_list raises while consuming the generator
    with gzip.open(path, "r") as wordfile:
        cor = spelling.GraphCorrector.from_word_list(
            word.decode("latin-1") for word in wordfile)

    assert_equal(cor.suggest("specail"), ["special"])

    st = RamStorage()
    gf = st.create_file("test.dawg")
    cor.to_file(gf)

    gf = st.open_file("test.dawg")
    cor = spelling.GraphCorrector.from_graph_file(gf)
    assert_equal(cor.suggest("specail", maxdist=1), ["special"])
    gf.close()
def test_ondisk():
    """A BitSet written to storage must behave identically when read back
    as an OnDiskBitSet, and round-trip back into an in-memory BitSet."""
    bits = BitSet([10, 11, 30, 50, 80])

    st = RamStorage()
    f = st.create_file("test")
    size = bits.to_disk(f)
    f.close()

    f = st.open_file("test")
    ondisk = OnDiskBitSet(f, 0, size)
    assert list(ondisk) == list(bits)

    # after(): next set bit strictly greater than the argument
    assert ondisk.after(0) == 10
    assert ondisk.after(10) == 11
    assert ondisk.after(80) is None
    assert ondisk.after(99) is None

    # before(): previous set bit strictly less than the argument
    assert ondisk.before(0) is None
    assert ondisk.before(99) == 80
    assert ondisk.before(80) == 50
    assert ondisk.before(10) is None

    # The same bytes load back into an equivalent in-memory BitSet
    f.seek(0)
    loaded = BitSet.from_disk(f, size)
    assert list(loaded) == list(bits)
def _rt(c, values, default):
    """Round-trip `values` through column type `c` twice: once with
    continuous docnums and once sparsely (every 7th docnum, with `default`
    filling the gaps)."""
    st = RamStorage()

    # Continuous: docnums 0..len(values)-1, after a 5-byte prefix
    f = st.create_file("test1")
    f.write(b("hello"))
    writer = c.writer(f)
    for docnum, value in enumerate(values):
        writer.add(docnum, value)
    writer.finish(len(values))
    length = f.tell() - 5  # column data starts after the prefix
    f.close()

    f = st.open_file("test1")
    reader = c.reader(f, 5, length, len(values))
    assert values == list(reader)
    for i in range(len(values)):
        assert values[i] == reader[i]
    f.close()

    # Sparse: scatter the values over a much larger doc space
    doccount = len(values) * 7 + 15
    target = [default] * doccount

    f = st.create_file("test2")
    f.write(b("hello"))
    writer = c.writer(f)
    for docnum, value in izip(xrange(10, doccount, 7), values):
        target[docnum] = value
        writer.add(docnum, value)
    writer.finish(doccount)
    length = f.tell() - 5
    f.close()

    f = st.open_file("test2")
    reader = c.reader(f, 5, length, doccount)
    assert target == list(reader)
    for i in range(doccount):
        assert target[i] == reader[i]

    # load() materializes the whole column and must agree too
    loaded = reader.load()
    assert target == list(loaded)
    f.close()
def test_find_self():
    """A corrector must never offer the query word itself as its top
    suggestion."""
    wordlist = sorted(u("book bake bike bone").split())

    st = RamStorage()
    f = st.create_file("test")
    spelling.wordlist_to_graph_file(wordlist, f)

    reader = fst.GraphReader(st.open_file("test"))
    corrector = spelling.GraphCorrector(reader)
    for word in ("book", "bake", "bike", "bone"):
        assert corrector.suggest(word)[0] != word
def test_insert_bytes():
    """GraphWriter should accept bytes keys.

    Only meaningful on Python 3, where bytes and str are distinct.
    """
    domain = [b("alfa"), b("bravo"), b("charlie")]

    st = RamStorage()
    writer = fst.GraphWriter(st.create_file("test"))
    writer.start_field("test")
    for key in domain:
        writer.insert(key)
    writer.close()

    cursor = fst.GraphReader(st.open_file("test")).cursor()
    assert list(cursor.flatten()) == domain
def test_checksum_file():
    """ChecksumFile's running checksum must match zlib.crc32 over the raw
    bytes, both when writing and when reading."""
    from whoosh.filedb.structfile import ChecksumFile
    from zlib import crc32

    def write_sample(f):
        # A mix of write methods to exercise the checksum wrapper
        f.write(b("Testing"))
        f.write_int(-100)
        f.write_varint(10395)
        f.write_string(b("Hello"))
        f.write_ushort(32959)

    st = RamStorage()

    # Write a control file without checksumming...
    f = st.create_file("control")
    write_sample(f)
    f.close()

    # ...and compute the expected CRC directly from its contents
    f = st.open_file("control")
    target = crc32(f.read()) & 0xffffffff
    f.close()

    # Writing through ChecksumFile must produce the same CRC
    f = st.create_file("test")
    cf = ChecksumFile(f)
    write_sample(cf)
    assert cf.checksum() == target
    f.close()

    # Reading every value back through ChecksumFile must as well
    f = st.open_file("test")
    cf = ChecksumFile(f)
    assert cf.read(7) == b("Testing")
    assert cf.read_int() == -100
    assert cf.read_varint() == 10395
    assert cf.read_string() == b("Hello")
    assert cf.read_ushort() == 32959
    assert cf.checksum() == target
    cf.close()
def test_insert_unicode():
    """GraphWriter should round-trip non-ASCII unicode keys."""
    domain = [u("\u280b\u2817\u2801\u281d\u2809\u2811"),  # braille
              u("\u65e5\u672c"),                          # CJK
              u("\uc774\uc124\ud76c")]                    # hangul

    st = RamStorage()
    writer = fst.GraphWriter(st.create_file("test"))
    writer.start_field("test")
    for key in domain:
        writer.insert(key)
    writer.close()

    cursor = fst.GraphReader(st.open_file("test")).cursor()
    assert list(cursor.flatten_strings()) == domain
def test_within_unicode():
    """fst.within() should match unicode keys against a pattern containing
    a wildcard character."""
    domain = [u("\u280b\u2817\u2801\u281d\u2809\u2811"),
              u("\u65e5\u672c"),
              u("\uc774\uc124\ud76c")]

    st = RamStorage()
    writer = fst.GraphWriter(st.create_file("test"))
    writer.start_field("test")
    for key in domain:
        writer.insert(key)
    writer.close()

    reader = fst.GraphReader(st.open_file("test"))
    matches = list(fst.within(reader, u("\uc774.\ud76c")))
    assert matches == [u("\uc774\uc124\ud76c")]
def test_termindex():
    """Terms added to a TermIndexWriter must come back in order, each with
    the weight and doc frequency they were stored with."""
    terms = [("a", "alfa"), ("a", "bravo"), ("a", "charlie"), ("a", "delta"),
             ("b", "able"), ("b", "baker"), ("b", "dog"), ("b", "easy")]

    st = RamStorage()
    writer = TermIndexWriter(st.create_file("test.trm"))
    for i, term in enumerate(terms):
        # doc frequency is set to the insertion index so we can verify order
        writer.add(term, FileTermInfo(1.0, i))
    writer.close()

    reader = TermIndexReader(st.open_file("test.trm"))
    for i, (stored, expected) in enumerate(zip(reader.keys(), terms)):
        assert_equal(stored, expected)
        info = reader.get(stored)
        assert_equal(info.weight(), 1.0)
        assert_equal(info.doc_frequency(), i)
def test_insert_unicode():
    """Non-ASCII unicode keys must survive a write/read round trip."""
    keys = [
        u("\u280b\u2817\u2801\u281d\u2809\u2811"),  # braille
        u("\u65e5\u672c"),                          # CJK
        u("\uc774\uc124\ud76c"),                    # hangul
    ]

    st = RamStorage()
    gw = fst.GraphWriter(st.create_file("test"))
    gw.start_field("test")
    for k in keys:
        gw.insert(k)
    gw.close()

    cur = fst.GraphReader(st.open_file("test")).cursor()
    assert list(cur.flatten_strings()) == keys
def test_within_unicode():
    """fst.within() must find the unicode key matching a wildcard pattern."""
    keys = [
        u("\u280b\u2817\u2801\u281d\u2809\u2811"),
        u("\u65e5\u672c"),
        u("\uc774\uc124\ud76c"),
    ]

    st = RamStorage()
    gw = fst.GraphWriter(st.create_file("test"))
    gw.start_field("test")
    for k in keys:
        gw.insert(k)
    gw.close()

    gr = fst.GraphReader(st.open_file("test"))
    hits = list(fst.within(gr, u("\uc774.\ud76c")))
    assert hits == [u("\uc774\uc124\ud76c")]
def test_types():
    """Exercise the fst value-type helpers: None handling in common/add/
    subtract, file round-tripping, and concrete common() results."""
    st = RamStorage()

    cases = ((fst.IntValues, 100, 0),
             (fst.BytesValues, b('abc'), b('')),
             (fst.ArrayValues("i"), array("i", [0, 123, 42]), array("i")),
             (fst.IntListValues, [0, 6, 97], []))

    for vtype, sample, zero in cases:
        # None is absorbing for common(), neutral for add()/subtract()
        assert vtype.common(None, sample) is None
        assert vtype.common(sample, None) is None
        assert vtype.common(None, None) is None
        assert vtype.subtract(sample, None) == sample
        assert vtype.subtract(None, sample) is None
        assert vtype.subtract(None, None) is None
        assert vtype.add(sample, None) == sample
        assert vtype.add(None, sample) == sample
        assert vtype.add(None, None) is None

        # Both a representative value and the "zero" value must round-trip
        f = st.create_file("test")
        vtype.write(f, sample)
        vtype.write(f, zero)
        f.close()
        f = st.open_file("test")
        assert vtype.read(f) == sample
        assert vtype.read(f) == zero

    assert fst.IntValues.common(100, 20) == 20
    assert fst.IntValues.add(20, 80) == 100
    assert fst.IntValues.subtract(100, 80) == 20

    # common() on bytes is the longest shared prefix
    assert fst.BytesValues.common(b("abc"), b("abc")) == b("abc")
    assert fst.BytesValues.common(b("abcde"), b("abfgh")) == b("ab")
    assert fst.BytesValues.common(b("abcde"), b("ab")) == b("ab")
    assert fst.BytesValues.common(b("ab"), b("abcde")) == b("ab")
    assert fst.BytesValues.common(None, b("abcde")) is None
    assert fst.BytesValues.common(b("ab"), None) is None

    # common() on arrays is the longest shared prefix of elements
    a1 = array("i", [0, 12, 123, 42])
    a2 = array("i", [0, 12, 420])
    prefix = array("i", [0, 12])
    assert fst.ArrayValues.common(a1, a1) == a1
    assert fst.ArrayValues.common(a1, a2) == prefix
    assert fst.ArrayValues.common(a2, a1) == prefix
    assert fst.ArrayValues.common(None, a1) is None
    assert fst.ArrayValues.common(a2, None) is None
def test_multistream():
    """Writes interleaved across sub-files of a CompoundWriter must read
    back per-stream, each stream's writes concatenated in order."""
    writes = [("a", "12345"), ("b", "abc"), ("c", "AaBbC"),
              ("a", "678"), ("c", "cDdEeF"), ("b", "defgh"),
              ("b", "ijk"), ("c", "fGgHh"), ("a", "9abc")]

    st = RamStorage()
    msw = compound.CompoundWriter(st)
    streams = dict((name, msw.create_file(name)) for name in "abc")
    for name, data in writes:
        streams[name].write(b(data))

    f = st.create_file("test")
    msw.save_as_compound(f)

    f = st.open_file("test")
    msr = compound.CompoundStorage(f)
    assert msr.open_file("a").read() == b("123456789abc")
    assert msr.open_file("b").read() == b("abcdefghijk")
    assert msr.open_file("c").read() == b("AaBbCcDdEeFfGgHh")
def test_random_termkeys():
    """Every randomly generated (fieldname, token) pair written to the term
    index must be findable afterwards."""
    def random_fieldname():
        # 1-19 ASCII uppercase letters
        return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20))

    def random_token():
        # 1-19 arbitrary BMP characters below the surrogate range
        return "".join(unichr(random.randint(0, 0xd7ff))
                       for _ in xrange(1, 20))

    domain = sorted([(random_fieldname(), random_token())
                     for _ in xrange(1000)])

    st = RamStorage()
    writer = TermIndexWriter(st.create_file("test.trm"))
    for term in domain:
        writer.add(term, FileTermInfo(1.0, 1))
    writer.close()

    reader = TermIndexReader(st.open_file("test.trm"))
    for term in domain:
        assert term in reader
def test_block():
    """Build a posting block, check its statistics, aggregate it into a
    FileTermInfo, then write it to disk and verify the read-back block
    reports the same statistics."""
    st = RamStorage()
    f = st.create_file("postfile")

    b = current(f, 0)
    b.append(0, 1.0, '', 1)
    b.append(1, 2.0, '', 2)
    b.append(2, 12.0, '', 6)
    b.append(5, 6.5, '', 420)

    assert b
    assert_equal(len(b), 4)
    assert_equal(list(b.ids), [0, 1, 2, 5])
    assert_equal(list(b.weights), [1.0, 2.0, 12.0, 6.5])
    assert_equal(b.values, None)
    assert_equal(b.min_length(), 1)
    # Lengths are stored byte-encoded, so compare against the re-encoded value
    assert_equal(b.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(b.max_weight(), 12.0)
    assert_equal(b.max_wol(), 2.0)

    # The term info aggregates the block's statistics
    ti = FileTermInfo()
    ti.add_block(b)
    assert_equal(ti.weight(), 21.5)
    assert_equal(ti.doc_frequency(), 4)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(ti.max_weight(), 12.0)
    assert_equal(ti.max_wol(), 2.0)

    b.write(compression=3)
    f.close()

    f = st.open_file("postfile")
    bb = current.from_file(f, 0)

    bb.read_ids()
    assert_equal(list(bb.ids), [0, 1, 2, 5])
    bb.read_weights()
    assert_equal(list(bb.weights), [1.0, 2.0, 12.0, 6.5])
    bb.read_values()
    # Fixed: was asserting on b.values (the in-memory block, already checked
    # above) instead of bb.values, the block just read back from disk.
    assert_equal(bb.values, None)
    assert_equal(bb.min_length(), 1)
    assert_equal(bb.max_length(), byte_to_length(length_to_byte(420)))
    assert_equal(bb.max_weight(), 12.0)
    assert_equal(bb.max_wol(), 2.0)
def test_types():
    """Exercise the dawg value-type helpers: None handling in common/add/
    subtract, file round-tripping, and concrete common() results."""
    st = RamStorage()

    cases = ((dawg.IntValues, 100, 0),
             (dawg.BytesValues, b('abc'), b('')),
             (dawg.ArrayValues("i"), array("i", [0, 123, 42]), array("i")),
             (dawg.IntListValues, [0, 6, 97], []))

    for vtype, sample, zero in cases:
        # None is absorbing for common(), neutral for add()/subtract()
        assert_equal(vtype.common(None, sample), None)
        assert_equal(vtype.common(sample, None), None)
        assert_equal(vtype.common(None, None), None)
        assert_equal(vtype.subtract(sample, None), sample)
        assert_equal(vtype.subtract(None, sample), None)
        assert_equal(vtype.subtract(None, None), None)
        assert_equal(vtype.add(sample, None), sample)
        assert_equal(vtype.add(None, sample), sample)
        assert_equal(vtype.add(None, None), None)

        # Both a representative value and the "zero" value must round-trip
        f = st.create_file("test")
        vtype.write(f, sample)
        vtype.write(f, zero)
        f.close()
        f = st.open_file("test")
        assert_equal(vtype.read(f), sample)
        assert_equal(vtype.read(f), zero)

    assert_equal(dawg.IntValues.common(100, 20), 20)
    assert_equal(dawg.IntValues.add(20, 80), 100)
    assert_equal(dawg.IntValues.subtract(100, 80), 20)

    # common() on bytes is the longest shared prefix
    assert_equal(dawg.BytesValues.common(b("abc"), b("abc")), b("abc"))
    assert_equal(dawg.BytesValues.common(b("abcde"), b("abfgh")), b("ab"))
    assert_equal(dawg.BytesValues.common(b("abcde"), b("ab")), b("ab"))
    assert_equal(dawg.BytesValues.common(b("ab"), b("abcde")), b("ab"))
    assert_equal(dawg.BytesValues.common(None, b("abcde")), None)
    assert_equal(dawg.BytesValues.common(b("ab"), None), None)

    # common() on arrays is the longest shared prefix of elements
    a1 = array("i", [0, 12, 123, 42])
    a2 = array("i", [0, 12, 420])
    prefix = array("i", [0, 12])
    assert_equal(dawg.ArrayValues.common(a1, a1), a1)
    assert_equal(dawg.ArrayValues.common(a1, a2), prefix)
    assert_equal(dawg.ArrayValues.common(a2, a1), prefix)
    assert_equal(dawg.ArrayValues.common(None, a1), None)
    assert_equal(dawg.ArrayValues.common(a2, None), None)
def test_random_termkeys():
    """Randomly generated term keys must all be present in the index after
    being written through TermIndexWriter."""
    def random_fieldname():
        # 1-19 ASCII uppercase letters
        return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20))

    def random_token():
        # 1-19 arbitrary BMP characters below the surrogate range
        return "".join(unichr(random.randint(0, 0xd7ff))
                       for _ in xrange(1, 20))

    domain = sorted([(random_fieldname(), random_token())
                     for _ in xrange(1000)])

    st = RamStorage()
    tw = TermIndexWriter(st.create_file("test.trm"))
    for term in domain:
        tw.add(term, FileTermInfo(1.0, 1))
    tw.close()

    tr = TermIndexReader(st.open_file("test.trm"))
    for term in domain:
        assert term in tr
def rw(size):
    """Write `size` hex-string values through the column writer and check
    that each reads back correctly.

    Note: relies on `col` from the enclosing scope.
    """
    st = RamStorage()
    f = st.create_file("test")
    writer = col.writer(f)
    for i in xrange(size):
        writer.add(i, hex(i).encode("latin1"))
    writer.finish(size)
    length = f.tell()
    f.close()

    f = st.open_file("test")
    reader = col.reader(f, 0, length, size)
    for i in xrange(size):
        value = reader[i]
        if i < 65535:
            assert value == hex(i).encode("latin1")
        else:
            # The column only stores the first 65535 unique values; anything
            # beyond that reads back as an empty byte string
            assert value == b('')
    f.close()
def words_to_corrector(words):
    """Build a GraphCorrector for `words` by writing a word graph to an
    in-RAM file and opening a reader over it."""
    storage = RamStorage()
    f = storage.create_file("test")
    spelling.wordlist_to_graph_file(words, f)
    f = storage.open_file("test")
    return spelling.GraphCorrector(fst.GraphReader(f))