def test_within_delete():
    """Check fuzzy lookups where query and key differ by a single character
    in length.
    """
    # "df" is the stored key "def" with the middle letter left out.
    storage = gwrite(enlist("abc def ghi"))
    reader = greader(storage)
    found = set(dawg.within(reader, "df"))
    assert_equal(found, set(["def"]))

    # "01" is the only stored key "0" with one extra trailing character.
    storage = gwrite(enlist("0"))
    reader = greader(storage)
    found = list(dawg.within(reader, "01"))
    assert_equal(found, ["0"])
def test_within_replace():
    """Check fuzzy lookups where one character of the query is substituted."""
    # "dez" differs from the stored key "def" only in its last letter.
    storage = gwrite(enlist("abc def ghi"))
    reader = greader(storage)
    found = set(dawg.within(reader, "dez"))
    assert_equal(found, set(["def"]))

    # Of the four two-digit keys, "11" differs from "00" in both positions
    # and is the only one not expected back.
    storage = gwrite(enlist("00 01 10 11"))
    reader = greader(storage)
    found = set(dawg.within(reader, "00"))
    expected = set(["00", "10", "01"])
    # The result set is also passed as the third argument to assert_equal.
    assert_equal(found, expected, found)
def terms_within(self, fieldname, text, maxdist, prefix=0, seen=None):
    """Yields the words in the given field that lie within ``maxdist``
    Damerau-Levenshtein edit distance of ``text``.

    If the field has a word graph, the lookup is delegated to ``within()``
    on that graph. Otherwise every term sharing the required prefix is
    compared against ``text`` with ``distance()``.

    :param fieldname: the field whose terms are searched.
    :param text: the word to find near matches for.
    :param maxdist: the maximum edit distance.
    :param prefix: number of leading characters of ``text`` a suggestion
        must share. Most misspellings leave the first letter intact, and
        requiring a prefix greatly reduces the time needed to generate the
        list of words.
    :param seen: an optional set object. Words already in the set are not
        yielded. In the fallback path each yielded word is added to it.
    """

    if self.has_word_graph(fieldname):
        graph_root = self.word_graph(fieldname)
        for match in within(graph_root, text, maxdist, prefix=prefix,
                            seen=seen):
            yield match
        return

    # No word graph for this field: scan the terms that share the prefix
    if seen is None:
        seen = set()
    for candidate in self.expand_prefix(fieldname, text[:prefix]):
        if candidate in seen:
            continue
        # An exact match is accepted without computing the distance
        close_enough = (candidate == text
                        or distance(candidate, text,
                                    limit=maxdist) <= maxdist)
        if close_enough:
            yield candidate
            seen.add(candidate)
def terms_within(self, fieldname, text, maxdist, prefix=0, seen=None):
    """Yields the words in the given field that lie within ``maxdist``
    Damerau-Levenshtein edit distance of ``text``.

    If the field has a word graph, the lookup is delegated to ``within()``
    on that graph. Otherwise every term sharing the required prefix is
    measured against ``text`` with ``distance()``.

    :param fieldname: the field whose terms are searched.
    :param text: the word to find near matches for.
    :param maxdist: the maximum edit distance.
    :param prefix: number of leading characters of ``text`` a suggestion
        must share. Most misspellings leave the first letter intact, and
        requiring a prefix greatly reduces the time needed to generate the
        list of words.
    :param seen: an optional set object. Words already in the set are not
        yielded. In the fallback path each yielded word is added to it.
    """

    if self.has_word_graph(fieldname):
        graph_root = self.word_graph(fieldname)
        for match in within(graph_root, text, maxdist, prefix=prefix,
                            seen=seen):
            yield match
        return

    # No word graph for this field: scan the terms that share the prefix
    already = set() if seen is None else seen
    required_prefix = text[:prefix]
    for candidate in self.expand_prefix(fieldname, required_prefix):
        if candidate in already:
            continue
        edits = distance(candidate, text, limit=maxdist)
        if edits <= maxdist:
            yield candidate
            already.add(candidate)
def terms_within(self, fieldname, text, maxdist, prefix=0):
    """Returns the terms in ``fieldname`` within ``maxdist`` edits of
    ``text``, using this reader's stored graph when the field has one.

    :param fieldname: the field whose terms are searched.
    :param text: the word to find near matches for.
    :param maxdist: the maximum edit distance, passed to ``dawg.within`` as
        ``k``.
    :param prefix: passed through unchanged to whichever implementation
        handles the lookup.
    """

    if self.has_word_graph(fieldname):
        # Start the graph search at this field's root
        root_address = self._graph.root(fieldname)
        return dawg.within(self._graph, text, k=maxdist, prefix=prefix,
                           address=root_address)

    # This reader doesn't have a graph stored for the field, so fall back to
    # the slow generic implementation on IndexReader
    return IndexReader.terms_within(self, fieldname, text, maxdist,
                                    prefix=prefix)
def test_within():
    """Check that ``dawg.within`` with ``k=1`` returns exactly the expected
    neighbours of "01" from a set of binary-digit keys stored on disk.
    """
    with TempStorage() as st:
        gwrite(enlist("0 00 000 001 01 010 011 1 10 100 101 11 110 111"), st)
        gr = greader(st)
        try:
            s = set(dawg.within(gr, "01", k=1))
        finally:
            # Close the reader even if the query raises, so it is never left
            # open when the temporary storage context exits.
            gr.close()

    expected = set(["0", "00", "01", "011", "010", "001", "10", "101", "1",
                    "11"])
    assert_equal(s, expected)
def _suggestions(self, text, maxdist, prefix, seen):
    """Yields ``(score, word)`` tuples for the words ``dawg.within`` finds
    in this object's word graph for ``text``.

    Every tuple carries the same score, the negated ``maxdist``.
    """

    matches = dawg.within(self.word_graph, text, maxdist, prefix=prefix,
                          seen=seen)
    for word in matches:
        # Higher scores are better, so negate the edit distance
        score = 0 - maxdist
        yield (score, word)
def test_within_unicode():
    """Check that fuzzy lookup works for keys made of non-ASCII characters."""
    keys = [
        u("\u280b\u2817\u2801\u281d\u2809\u2811"),
        u("\u65e5\u672c"),
        u("\uc774\uc124\ud76c"),
    ]

    storage = RamStorage()
    writer = dawg.GraphWriter(storage.create_file("test"))
    writer.start_field("test")
    for key in keys:
        writer.insert(key)
    writer.close()

    reader = dawg.GraphReader(storage.open_file("test"))
    # The query is the third key with its middle character replaced by "."
    query = u("\uc774.\ud76c")
    found = list(dawg.within(reader, query))
    assert_equal(found, [u("\uc774\uc124\ud76c")])
def test_within_prefix():
    """Check that ``prefix=2`` limits results to keys sharing the query's
    first two characters.
    """
    storage = gwrite(enlist("aabc aadc babc badc"))
    reader = greader(storage)
    found = set(dawg.within(reader, "aaxc", prefix=2))
    # The keys starting with "ba" are not expected back
    expected = set(["aabc", "aadc"])
    assert_equal(found, expected)
def test_within_k2():
    """Check the keys returned for the query "cb" when ``k=2``."""
    storage = gwrite(enlist("abc bac cba"))
    reader = greader(storage)
    found = set(dawg.within(reader, "cb", k=2))
    # "bac" is the one stored key not expected back
    expected = set(["abc", "cba"])
    assert_equal(found, expected)
def test_within_transpose():
    """Check that a query with two adjacent letters swapped finds the key."""
    storage = gwrite(enlist("abc def ghi"))
    reader = greader(storage)
    # "dfe" is the stored key "def" with its last two letters exchanged
    found = set(dawg.within(reader, "dfe"))
    assert_equal(found, set(["def"]))
def test_within_insert():
    """Check that a one-character query finds keys that are one character
    longer.
    """
    storage = gwrite(enlist("00 01 10 11"))
    reader = greader(storage)
    found = set(dawg.within(reader, "0"))
    # "11" contains no "0" and is the one stored key not expected back
    expected = set(["00", "01", "10"])
    assert_equal(found, expected)
def test_within_match():
    """Check that a query identical to a stored key returns just that key."""
    storage = gwrite(enlist("abc def ghi"))
    reader = greader(storage)
    found = set(dawg.within(reader, "def"))
    assert_equal(found, set(["def"]))
def _suggestions(self, text, maxdist, prefix, seen):
    """Yields ``(score, word)`` tuples for the words ``dawg.within`` finds
    in this object's word graph for ``text``.

    Each score is the result of calling ``self.ranking`` with the word and
    ``maxdist``.
    """

    # Look the ranking callable up once, before the search starts
    rank = self.ranking
    matches = dawg.within(self.word_graph, text, maxdist, prefix=prefix,
                          seen=seen)
    for word in matches:
        score = rank(word, maxdist)
        yield (score, word)