def wordlist_to_graph_file(wordlist, dbfile, fieldname="_", strip=True):
    """Writes a word graph file from a list of words.

    >>> # Open a word list file with one word on each line, and write the
    >>> # word graph to a graph file
    >>> with open("mywords.txt") as words:
    ...     wordlist_to_graph_file(words, "mywords.dawg")

    :param wordlist: an iterable containing the words for the graph. The words
        must be in sorted order. The iterable is consumed directly, so a
        plain string would be iterated character by character; pass an open
        file or a list of words, not a filename.
    :param dbfile: a filename string or file-like object to write the word
        graph to. This function will close the file.
    :param fieldname: the name of the graph field the words are written to.
    :param strip: if True, ``strip()`` is called on each word before it is
        inserted.
    """

    from whoosh.filedb.structfile import StructFile

    # Normalize ``dbfile`` to a StructFile: filename -> binary file -> wrapper.
    if isinstance(dbfile, string_type):
        dbfile = open(dbfile, "wb")
    if not isinstance(dbfile, StructFile):
        dbfile = StructFile(dbfile)

    try:
        gw = dawg.GraphWriter(dbfile)
        gw.start_field(fieldname)
        for word in wordlist:
            if strip:
                word = word.strip()
            gw.insert(word)
        gw.finish_field()
    except BaseException:
        # Honor the "will close the file" contract even when writing fails
        # (e.g. an out-of-order or empty word), then let the error propagate.
        dbfile.close()
        raise
    gw.close()
def test_empty_key():
    """Inserting an empty key must raise ``KeyError``."""
    dbfile = RamStorage().create_file("test")
    writer = dawg.GraphWriter(dbfile)
    writer.start_field("_")

    # Empty bytes, native string, unicode string and empty list, in turn.
    for empty_key in (b(""), "", u(""), []):
        assert_raises(KeyError, writer.insert, empty_key)
def gwrite(keys, st=None):
    """Write ``keys`` to field ``"_"`` of a graph file named ``"test"``.

    :param keys: an iterable of keys, inserted in iteration order.
    :param st: the storage to create the file in. A new ``RamStorage`` is
        used when this is falsy.
    :returns: the storage object containing the written graph file.
    """
    if not st:
        st = RamStorage()

    writer = dawg.GraphWriter(st.create_file("test"))
    writer.start_field("_")
    for k in keys:
        writer.insert(k)
    writer.finish_field()
    writer.close()

    return st
def test_insert_bytes():
    """Byte-string keys written to a graph are read back unchanged."""
    # This test is only meaningful on Python 3
    expected = [b("alfa"), b("bravo"), b("charlie")]

    storage = RamStorage()
    writer = dawg.GraphWriter(storage.create_file("test"))
    writer.start_field("test")
    for item in expected:
        writer.insert(item)
    writer.close()

    reader = dawg.GraphReader(storage.open_file("test"))
    cursor = reader.cursor()
    assert_equal(list(cursor.flatten()), expected)
def _fst_roundtrip(domain, t):
    """Write key/value pairs to a graph file and check they read back equal.

    :param domain: a list of ``(key, value)`` pairs, inserted in list order.
        The pairs read back from the file are compared against this list.
    :param t: passed as ``vtype`` to both the ``GraphWriter`` and the
        ``GraphReader``.
    """
    with TempStorage() as st:
        f = st.create_file("test")
        gw = dawg.GraphWriter(f, vtype=t)
        gw.start_field("_")
        for key, value in domain:
            gw.insert(key, value)
        gw.finish_field()
        gw.close()

        f = st.open_file("test")
        try:
            gr = dawg.GraphReader(f, vtype=t)
            cur = dawg.Cursor(gr)
            assert_equal(list(cur.flatten_v()), domain)
        finally:
            # Close the file even when the assertion fails, so it is not
            # still open when the TempStorage context exits.
            f.close()
def test_insert_unicode():
    """Non-ASCII unicode keys written to a graph are read back unchanged."""
    words = [
        u("\u280b\u2817\u2801\u281d\u2809\u2811"),
        u("\u65e5\u672c"),
        u("\uc774\uc124\ud76c"),
    ]

    storage = RamStorage()
    writer = dawg.GraphWriter(storage.create_file("test"))
    writer.start_field("test")
    for word in words:
        writer.insert(word)
    writer.close()

    reader = dawg.GraphReader(storage.open_file("test"))
    cursor = reader.cursor()
    assert_equal(list(cursor.flatten_strings()), words)
def test_within_unicode():
    """``dawg.within`` matches a unicode key from a query that differs from
    it in the middle character.
    """
    words = [
        u("\u280b\u2817\u2801\u281d\u2809\u2811"),
        u("\u65e5\u672c"),
        u("\uc774\uc124\ud76c"),
    ]

    storage = RamStorage()
    writer = dawg.GraphWriter(storage.create_file("test"))
    writer.start_field("test")
    for word in words:
        writer.insert(word)
    writer.close()

    reader = dawg.GraphReader(storage.open_file("test"))
    query = u("\uc774.\ud76c")
    matches = list(dawg.within(reader, query))
    assert_equal(matches, [u("\uc774\uc124\ud76c")])
def test_fields():
    """Two fields written to one graph file are read back separately."""
    with TempStorage() as st:
        writer = dawg.GraphWriter(st.create_file("test"))

        writer.start_field("f1")
        for word in ("a", "aa", "ab"):
            writer.insert(word)
        writer.finish_field()

        writer.start_field("f2")
        for word in ("ba", "baa", "bab"):
            writer.insert(word)
        # No explicit finish_field() for "f2": close() follows directly.
        writer.close()

        reader = dawg.GraphReader(st.open_file("test"))
        first = dawg.Cursor(reader, reader.root("f1"))
        second = dawg.Cursor(reader, reader.root("f2"))
        assert_equal(list(first.flatten_strings()), ["a", "aa", "ab"])
        assert_equal(list(second.flatten_strings()), ["ba", "baa", "bab"])
        reader.close()
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), finishes by calling
        ``writer.commit(merge=False)`` on the writer opened by this function.
        If False, that writer is left uncommitted and is not returned.
    """

    from whoosh.filedb.filereading import SegmentReader
    from whoosh.support import dawg

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema
    segments = writer.segments

    # One ".dag" file per segment, holding one graph field per field name.
    for segment in segments:
        r = SegmentReader(storage, schema, segment)
        f = segment.create_file(storage, ".dag")
        gw = dawg.GraphWriter(f)
        for fieldname in fieldnames:
            gw.start_field(fieldname)
            for word in r.lexicon(fieldname):
                gw.insert(word)
            gw.finish_field()
        gw.close()

    # Flag the fields in the schema only after every segment's graph exists.
    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
def test_keys_out_of_order():
    """Inserting a key that sorts before the previous key raises
    ``KeyError``.
    """
    writer = dawg.GraphWriter(RamStorage().create_file("test"))
    writer.start_field("test")
    writer.insert("alfa")
    # "abba" sorts before the already-inserted "alfa".
    assert_raises(KeyError, writer.insert, "abba")
def test_empty_fieldname():
    """``start_field`` raises ``ValueError`` for "", None and 0."""
    dbfile = RamStorage().create_file("test")
    writer = dawg.GraphWriter(dbfile)
    for bad_name in ("", None, 0):
        assert_raises(ValueError, writer.start_field, bad_name)