예제 #1
0
class GNTagger:
    """Match trie keys inside text and resolve them to geonames records.

    The tagger holds three lookup resources loaded at construction time:
    ``gn_index`` (record lookup by integer ID), ``trie`` (text matcher) and
    ``value2IDs`` (maps a trie value to a comma-separated ID string).
    """

    def __init__(self, fn_geonames, fn_trie, fn_value2IDs):
        """Load the lookup resources, reporting progress through ``out``.

        Args:
            fn_geonames: Argument handed to ``GeonamesIndex``
                (presumably a file name -- confirm against that class).
            fn_trie: Argument handed to ``TrieDict.load``.
            fn_value2IDs: Argument handed to ``load_IDs``.
        """
        out("loading geonames index...\n")
        self.gn_index = GeonamesIndex(fn_geonames)
        out("loading trie...\n")
        self.trie = TrieDict.load(fn_trie)
        out("loading value2IDs list...\n")
        self.value2IDs = load_IDs(fn_value2IDs)
        out("done\n")

    def parse(self, text, bound_chars):
        """Return every trie match found in *text* with its resolved records.

        Args:
            text: UTF-8 encoded bytes or already-decoded text. Leading and
                trailing whitespace is stripped before matching.
            bound_chars: Forwarded unchanged to ``self.trie.match`` as the
                ``bound_chars`` keyword.

        Returns:
            A dict with the keys ``"success"`` (always 1), ``"n_matches"``
            (number of matches) and ``"matches"``: a list holding one dict
            per match with the keys ``"idx"``, ``"token"`` (the matched key)
            and ``"gnrecs"`` (one tab-split record per resolved ID).
        """
        text = text.strip()
        # Only byte strings need decoding. Calling .decode() on text that is
        # already decoded raises AttributeError on Python 3 and triggers an
        # implicit ASCII encode on Python 2.
        if isinstance(text, bytes):
            text = text.decode("utf-8")

        # Example value once hard-coded here: " .,;:_-!?=()[]{}'\"$%&"
        matches = self.trie.match(text, bound_chars=bound_chars)

        records = []
        for key, value, pos in matches:
            ids = [int(ID) for ID in self.value2IDs[value].split(",")]
            records.append({
                # NOTE(review): the outer 1-tuple comes from a trailing comma
                # in the original assignment and looks accidental. It is kept
                # so callers that read idx[0] keep working.
                # pos is presumably the index of the match's last character,
                # making the pair (start, end) inclusive -- TODO confirm.
                "idx": ((pos - len(key) + 1, pos),),
                "token": key,
                "gnrecs": [self.gn_index.get(ID).split("\t") for ID in ids],
            })

        return {"success": 1, "n_matches": len(matches), "matches": records}
예제 #2
0
 def __init__(self, fn_geonames, fn_trie, fn_value2IDs):
     """Load the three lookup resources, announcing each step via ``out``.

     Args:
         fn_geonames: Argument handed to ``GeonamesIndex``; the result is
             stored as ``self.gn_index``.
         fn_trie: Argument handed to ``TrieDict.load``; the result is
             stored as ``self.trie``.
         fn_value2IDs: Argument handed to ``load_IDs``; the result is
             stored as ``self.value2IDs``.
     """
     # Each loader is wrapped in a lambda so nothing is looked up or run
     # until its progress message has been emitted.
     stages = (
         ("loading geonames index...\n", "gn_index",
          lambda: GeonamesIndex(fn_geonames)),
         ("loading trie...\n", "trie",
          lambda: TrieDict.load(fn_trie)),
         ("loading value2IDs list...\n", "value2IDs",
          lambda: load_IDs(fn_value2IDs)),
     )
     for message, attribute, load in stages:
         out(message)
         setattr(self, attribute, load())
     out("done\n")