class GNTagger:
    """Find trie matches in a text and attach the geonames records for each.

    The instance holds three resources loaded in ``__init__``: a geonames
    index (``gn_index``), a trie (``trie``) and a value -> ID-list table
    (``value2IDs``).
    """

    def __init__(self, fn_geonames, fn_trie, fn_value2IDs):
        """Load the resources used by ``parse``, logging progress via ``out``.

        Args:
            fn_geonames: Argument handed to ``GeonamesIndex``.
            fn_trie: Argument handed to ``TrieDict.load``.
            fn_value2IDs: Argument handed to ``load_IDs``.
        """
        out("loading geonames index...\n")
        self.gn_index = GeonamesIndex(fn_geonames)
        out("loading trie...\n")
        self.trie = TrieDict.load(fn_trie)
        out("loading value2IDs list...\n")
        self.value2IDs = load_IDs(fn_value2IDs)
        out("done\n")

    def parse(self, text, bound_chars):
        """Match ``text`` against the trie and resolve each match to records.

        Args:
            text: Input text. UTF-8 encoded bytes are decoded; text that is
                already decoded is used as is. Surrounding whitespace is
                stripped before matching.
            bound_chars: Forwarded unchanged to ``self.trie.match`` as its
                ``bound_chars`` keyword. Presumably the characters that
                delimit tokens, e.g. ``" .,;:_-!?=()[]{}'\\"$%&"`` -- confirm
                against the trie implementation.

        Returns:
            A dict with ``"success"`` (always ``1``), ``"n_matches"`` (number
            of matches) and ``"matches"``, a list with one dict per match:
            ``"idx"``, ``"token"`` (the matched key) and ``"gnrecs"`` (one
            list of tab-separated fields per ID associated with the match).
        """
        text = text.strip()
        # Decode only real byte input: calling .decode() on already-decoded
        # text fails (AttributeError on Python 3, implicit ASCII encode on
        # Python 2).
        if isinstance(text, bytes):
            text = text.decode("utf-8")

        matches = self.trie.match(text, bound_chars=bound_chars)
        result = {"success": 1, "n_matches": len(matches), "matches": []}

        for key, value, pos in matches:
            # value2IDs maps the trie value to a comma-separated ID string.
            ids = [int(raw_id) for raw_id in self.value2IDs[value].split(",")]
            result["matches"].append({
                # NOTE(review): the original statement ended with a trailing
                # comma, so "idx" is a one-element tuple wrapping the
                # (start, end) pair. That looks accidental, but the shape is
                # kept because callers may index it; confirm before
                # flattening. `pos` is presumably the index of the last
                # matched character -- verify against TrieDict.match.
                "idx": ((pos - len(key) + 1, pos),),
                "token": key,
                "gnrecs": [
                    self.gn_index.get(gn_id).split("\t") for gn_id in ids
                ],
            })
        return result
def __init__(self, fn_geonames, fn_trie, fn_value2IDs):
    """Load the geonames index, the trie and the value2IDs list in order.

    A progress message is written through ``out`` before each load and a
    final one after the last; each result is stored on ``self``.

    Args:
        fn_geonames: Argument handed to ``GeonamesIndex``.
        fn_trie: Argument handed to ``TrieDict.load``.
        fn_value2IDs: Argument handed to ``load_IDs``.
    """
    # Loaders are wrapped in lambdas so each one runs only after its own
    # progress message has been written.
    load_steps = (
        ("loading geonames index...\n", "gn_index",
         lambda: GeonamesIndex(fn_geonames)),
        ("loading trie...\n", "trie",
         lambda: TrieDict.load(fn_trie)),
        ("loading value2IDs list...\n", "value2IDs",
         lambda: load_IDs(fn_value2IDs)),
    )
    for message, attribute, loader in load_steps:
        out(message)
        setattr(self, attribute, loader())
    out("done\n")