Example #1
 def test_ignore_case_match_longest(self):
     if sys.version_info.major < 3:
         return  # skip on Python 2: the Unicode case folding below needs Python 3
     trie = Trie(ignore_case=True)
     ids = {w: trie.insert(w) for w in [u"aİİ", u"aai̇", u"aai̇bİ"]}
     matches = list(trie.match_longest(u"aaİ aai̇bİaa"))
     self.assertEqual(matches,
                      [(ids[u"aai̇"], 0, len(u"aaİ")),
                       (ids[u"aai̇bİ"], len(u"aaİ "), len(u"aaİ aai̇bİ"))])
     sep = set([ord(" ")])  # space as separator
     # with the separator set, matches must end at word boundaries, so the
     # embedded u"aai̇bİ" (followed by "aa") no longer matches
     matches = list(trie.match_longest(u"aaİ aai̇bİaa", sep))
     self.assertEqual(matches, [
         (ids[u"aai̇"], 0, len(u"aaİ")),
     ])
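
For readers who find the Turkish dotted-İ input above hard to follow, here is a minimal ASCII-only sketch of the same ignore_case behavior (the cyac import path is an assumption; the Trie API matches the examples on this page):

from cyac import Trie  # assumed import path

trie = Trie(ignore_case=True)
apple_id = trie.insert(u"apple")
# match_longest yields (word_id, start, end) tuples over the input text;
# with ignore_case=True the lookup is case-insensitive
for id_, start, end in trie.match_longest(u"An Apple a day"):
    assert id_ == apple_id
    assert (start, end) == (3, 8)  # the span covering "Apple"
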
Example #2
 def test_match_longest(self):
     trie = Trie()
     ids = {
         w: trie.insert(w)
         for w in
         [u"New York", u"New", u"York", u"York City", u"City", u"City is"]
     }
     # without a separator set, matches may end mid-word: "City is" matches
     # even though it is immediately followed by "A"
     matches = list(trie.match_longest(u"New York City isA"))
     self.assertEqual(
         matches,
         [(ids[u"New York"], 0, len(u"New York")),
          (ids[u"City is"], len(u"New York "), len(u"New York City is"))])
     sep = set([ord(" ")])  # space as separator
     # with space as a separator, matches must align to word boundaries,
     # so "City is" (which would end inside "isA") loses to "City"
     matches = list(trie.match_longest(u"New York City isA", sep))
     self.assertEqual(
         matches,
         [(ids[u"New York"], 0, len(u"New York")),
          (ids[u"City"], len(u"New York "), len(u"New York City"))])
Example #3
 def test_match_words(self):
     dir_ = os.path.dirname(__file__)
     trie = Trie()
     ids = []
     # first pass: insert every non-empty word in the file as a pattern
     with open(os.path.join(dir_, "../bench/words.txt")) as fi:
         for line in fi:
             line = line.strip()
             if isinstance(line, bytes):
                 line = line.decode("utf8")
             if len(line) > 0:
                 ids.append(trie.insert(line))
     # second pass: read the whole file and match it back against the trie
     with open(os.path.join(dir_, "../bench/words.txt")) as fi:
         txt = fi.read()
         if isinstance(txt, bytes):
             txt = txt.decode("utf8")
     sep = set([ord("\n")])  # newline as separator
     matched = []
     # each line should match back exactly the word inserted from it,
     # in insertion order
     for v, start, end in trie.match_longest(txt, sep):
         matched.append(v)
         self.assertEqual(txt[start:end], trie[v])
     self.assertEqual(matched, ids)
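
Example #4 below memory-maps a serialized trie; the save/from_buff round trip on its own looks like this (the file name and import path are assumptions):

from mmap import mmap
from cyac import Trie  # assumed import path

trie = Trie()
word_id = trie.insert(u"New York")
trie.save("trie.bin")  # serialize the automaton to disk

with open("trie.bin", "r+b") as bf:
    # copy=False keeps the loaded trie backed by the mmap instead of
    # copying the buffer into memory
    loaded = Trie.from_buff(mmap(bf.fileno(), 0), copy=False)
    assert loaded[word_id] == u"New York"
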
Example #4
from itertools import combinations
from mmap import mmap
from pathlib import Path
from typing import Iterable, Optional

from cyac import Trie  # assumed import path

# _clean_title, _XP_SEPS (a compiled regex matching separator characters),
# pickle_load and pickle_dump are project-level helpers assumed to be
# defined elsewhere in the module.


class WikiPageDetector:
    def __init__(self, pages: Optional[Iterable[str]] = None):
        self._map = None
        self._trie = None
        if pages is not None:
            self.build(pages)

    @staticmethod
    def load(path: Path):
        wpd = WikiPageDetector()
        wpd._map = pickle_load(path / "wpd_map.gz")
        with (path / "wpd_trie").open("r+b") as bf:
            wpd._trie = Trie.from_buff(mmap(bf.fileno(), 0), copy=False)
        return wpd

    def dump(self, path: Path):
        self._trie.save(str(path / "wpd_trie"))
        pickle_dump(self._map, path / "wpd_map.gz", compress=True)

    def build(self, pages: Iterable[str]):
        key2titles = {}
        for page in pages:
            if not page:
                continue
            key = _clean_title(page).lower()
            if not key:
                key = page
            titles = key2titles.setdefault(key, [])
            titles.append(page)
        mapping = {}
        self._trie = Trie(ignore_case=True)
        for key in key2titles:
            id_ = self._trie.insert(key)
            # setdefault guards against insert() returning an existing id
            # for a case-variant of an already-inserted key
            mapping.setdefault(id_, tuple(key2titles[key]))
        # dense id -> titles table indexed directly by trie id
        self._map = tuple(mapping.get(i) for i in range(max(mapping) + 1))

    def find_pages(self, text: str):
        def iter_matches(source):
            # every separator character present in the source acts as a
            # word boundary for the longest-match scan
            ac_seps = set([ord(p) for p in _XP_SEPS.findall(source)])
            for id_, start_idx, end_idx in self._trie.match_longest(
                source, ac_seps
            ):
                yield (start_idx, end_idx, self._map[id_])

        for match in iter_matches(text):
            yield match
            # re-scan sub-spans of each match so that shorter page titles
            # nested inside a longer match are also reported
            match_text = text[match[0] : match[1]]
            seps = list(_XP_SEPS.finditer(match_text))
            if not seps:
                continue
            # split the match into (offset, token) pairs on the separators,
            # skipping tokens that are short and non-alphanumeric
            tokens = []
            last_end = 0
            for sep in seps:
                token = match_text[last_end : sep.start()]
                start = last_end
                last_end = sep.end()
                if len(token) < 2 and not token.isalnum():
                    continue
                tokens.append((start, token))
            tokens.append((last_end, match_text[last_end:]))
            num_tokens = len(tokens)
            # scan every contiguous token span except the full match itself
            for s, e in combinations(range(num_tokens + 1), 2):
                if s == 0 and e == num_tokens:
                    continue
                e -= 1
                submatches = set()
                start = tokens[s][0]
                end = tokens[e][0] + len(tokens[e][1])
                subtext = match_text[start:end]
                start += match[0]  # rebase to offsets in the original text
                for sidx, eidx, pages in iter_matches(subtext):
                    coords = (sidx + start, eidx + start)
                    if coords in submatches:
                        continue
                    submatches.add(coords)
                    yield (*coords, pages)
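
A minimal usage sketch for the detector (the titles, query text, and directory are made up for illustration; build, find_pages, dump and load are the methods defined above, and the exact matches depend on the project's _clean_title and _XP_SEPS helpers):

detector = WikiPageDetector(["New York", "New York City"])
for start, end, pages in detector.find_pages("I moved to new york city."):
    print(start, end, pages)  # (offset, offset, tuple of matching page titles)

# persistence round trip via dump/load; the target directory must exist
detector.dump(Path("/tmp/wpd"))
restored = WikiPageDetector.load(Path("/tmp/wpd"))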