class CharCategory:
    """Character-category table.

    Maps a character code to its Category record and carries per-code
    compatibility masks used to test whether two characters may belong
    to the same token.
    """

    __slots__ = ['cc_rd', 'cat', 'c2c_rd', 'char2id', 'eql_masks']

    def __init__(self, path, bigendian=False, use_mmap=None):
        """Load "char.category" and "code2category" from *path*."""
        self.cc_rd = DictReader(path + "/char.category", bigendian, use_mmap)
        with self.cc_rd as reader:
            self.cat = self.convert_categories(reader.get_intarray())
        self.c2c_rd = DictReader(path + "/code2category", bigendian, use_mmap)
        with self.c2c_rd as reader:
            # The file holds two int arrays of equal length, back to back:
            # char2id first, then the compatibility masks.
            self.char2id = reader.get_intarray(reader.size() // 4 // 2)
            self.eql_masks = reader.get_intarray(reader.size() // 4 // 2)

    def release(self):
        """Drop the loaded tables and release both readers."""
        del self.cat
        del self.char2id
        del self.eql_masks
        self.cc_rd.release()
        del self.cc_rd
        self.c2c_rd.release()
        del self.c2c_rd

    def category(self, code):
        """Return the Category record for character code *code*."""
        category_id = self.char2id[code]
        return self.cat[category_id]

    def is_compatible(self, code1, code2):
        """True when the two codes share at least one compatibility bit."""
        return bool(self.eql_masks[code1] & self.eql_masks[code2])

    def convert_categories(self, flat):
        """Group a flat int array into Category records, four ints apiece."""
        categories = []
        for pos in range(0, len(flat), 4):
            categories.append(
                Category(flat[pos], flat[pos + 1], flat[pos + 2], flat[pos + 3]))
        return categories
def __init__(self, path, bigendian=False, use_mmap=None):
    """Load the character-category tables stored under *path*.

    Reads "char.category" (flat Category records) and "code2category"
    (char2id table followed by compatibility masks).
    """
    category_path = path + "/char.category"
    self.cc_rd = DictReader(category_path, bigendian, use_mmap)
    with self.cc_rd as reader:
        self.cat = self.convert_categories(reader.get_intarray())
    mapping_path = path + "/code2category"
    self.c2c_rd = DictReader(mapping_path, bigendian, use_mmap)
    with self.c2c_rd as reader:
        # Two equal-length int arrays stored back to back.
        self.char2id = reader.get_intarray(reader.size() // 4 // 2)
        self.eql_masks = reader.get_intarray(reader.size() // 4 // 2)
class Matrix:
    """Connection-cost table between adjacent morphemes.

    The table is stored row-major by right context id:
    matrix[right_id * left_size + left_id].
    """
    __slots__ = ['rd', 'left_size', 'matrix']

    def __init__(self, path, bigendian=False, use_mmap=None):
        """Load "matrix.bin" from *path*: two size ints, then the costs."""
        self.rd = DictReader(path + "/matrix.bin", bigendian, use_mmap)
        with self.rd as r:
            self.left_size = r.get_int()
            right_size = r.get_int()
            self.matrix = r.get_shortarray(self.left_size * right_size)

    def release(self):
        """Free the cost table and the underlying reader."""
        del self.matrix
        self.rd.release()
        # Drop the released reader as well, matching the release()
        # convention of the other dictionary classes in this module
        # (e.g. CharCategory, Searcher), which del their readers.
        del self.rd

    def linkcost(self, left_id, right_id):
        """Return the connection cost between two adjacent morphemes."""
        return self.matrix[right_id * self.left_size + left_id]
def __init__(self, path, bigendian=False, splitted=False, use_mmap=None):
    """Load the word dictionary stored under *path*.

    When *splitted* is true the raw word data is spread across several
    "word.dat.*" files; otherwise a single "word.dat" is used.
    """
    self.splitted = splitted
    self.trie = Searcher(path + "/word2id", bigendian, use_mmap)
    if splitted:
        # Word data is split across several files; concatenate them.
        part_paths = sorted(glob.glob(path + "/word.dat.*"))
        self.data = util.get_chararray_multi(part_paths, bigendian)
    else:
        self.wd_rd = DictReader(path + "/word.dat", bigendian, use_mmap)
        with self.wd_rd as reader:
            self.data = reader.get_chararray()
    self.wa_rd = DictReader(path + "/word.ary.idx", bigendian, use_mmap)
    with self.wa_rd as reader:
        self.indices = reader.get_intarray()
    self.wi_rd = DictReader(path + "/word.inf", bigendian, use_mmap)
    with self.wi_rd as reader:
        # One record per word: an int offset plus three shorts.
        word_count = reader.size() // (4 + 2 + 2 + 2)
        self.offsets = reader.get_intarray(word_count)      # start of each word's feature data
        self.left_ids = reader.get_shortarray(word_count)   # left context id per word id
        self.right_ids = reader.get_shortarray(word_count)  # right context id per word id
        self.costs = reader.get_shortarray(word_count)      # cost per word id
def __init__(self, path, bigendian=False, use_mmap=None):
    """Instantiate a DoubleArray Searcher.

    @param path path of the DoubleArray file
    @param bigendian whether the file is big-endian
    @param use_mmap use mmap or not; None: depends on environment
    """
    self.rd = DictReader(path, bigendian, use_mmap)
    with self.rd as reader:
        node_count = reader.get_int()
        key_count = reader.get_int()
        tail_len = reader.get_int()
        self.num_keys = key_count
        self.begs = reader.get_intarray(key_count)    # start of each key's tail
        self.base = reader.get_intarray(node_count)   # BASE array
        self.lens = reader.get_shortarray(key_count)  # length of each key's tail
        self.chck = reader.get_chararray(node_count)  # CHECK array
        self.tail = reader.get_chararray(tail_len)    # concatenated key tails
class Searcher:
    """DoubleArray trie searcher.

    Supports exact-match lookup and common-prefix search over a
    DoubleArray produced by the dictionary builder.
    """
    __slots__ = ['rd', 'num_keys', 'begs', 'base', 'lens', 'chck', 'tail']

    def __init__(self, path, bigendian=False, use_mmap=None):
        """
        instantiate a DoubleArray Searcher

        @param path path of DoubleArray
        @param bigendian whether the file is big-endian
        @param use_mmap use mmap or not; None: depends on environment
        """
        self.rd = DictReader(path, bigendian, use_mmap)
        with self.rd as r:
            node_size = r.get_int()
            tind_size = r.get_int()
            tail_size = r.get_int()
            self.num_keys = tind_size
            self.begs = r.get_intarray(tind_size)    # start of each key's tail
            self.base = r.get_intarray(node_size)    # BASE array
            self.lens = r.get_shortarray(tind_size)  # length of each key's tail
            self.chck = r.get_chararray(node_size)   # CHECK array
            self.tail = r.get_chararray(tail_size)   # concatenated key tails

    def release(self):
        """Drop the loaded arrays and release the underlying reader."""
        del self.begs
        del self.base
        del self.lens
        del self.chck
        del self.tail
        self.rd.release()
        del self.rd

    def size(self):
        """Return the number of keys stored in the DoubleArray.

        @return number of keys stored in the DoubleArray
        """
        return self.num_keys

    def search(self, key):
        """Search for an exact key.

        @param key key string to look up
        @return the key's id if found, -1 otherwise
        """
        begs = self.begs
        tail = self.tail
        lens = self.lens
        base = self.base
        chck = self.chck
        node = base[0]

        def exists(kin, node):
            # The remainder of the key must equal the tail stored for
            # this terminal node.
            node_id = base_id(node)
            beg = begs[node_id]
            s = tail[beg:beg + lens[node_id]]
            return kin.rest().equals(s)

        kin = KeyStream(key)
        code = kin.read()
        while 1:
            idx = node + code
            node = base[idx]
            if chck[idx] == code:
                if node >= 0:
                    # BUGFIX: advance to the next character before following
                    # the transition. Previously the loop continued without
                    # reading, so the same code was compared at every depth
                    # and multi-character keys could never match (compare
                    # commonprefix_search, which reads once per iteration).
                    code = kin.read()
                    continue
                elif kin.eos() or exists(kin, node):
                    return base_id(node)
            return -1

    # with, iterator
    def commonprefix_search(self, key, start, fn):
        """Run a common-prefix search.

        fn(start, end, key_id) is called for every stored key that is a
        prefix of key[start:].

        @param key string to scan
        @param start first index of the region to scan
        @param fn callback invoked on every match
        """
        base = self.base
        chck = self.chck
        begs = self.begs
        tail = self.tail
        lens = self.lens
        node = base[0]
        offset = -1
        kin = KeyStream(key, start)

        def call_if_key_including(kin, node, start, offset, fn):
            # Report a match when the stored tail is a prefix of the
            # remaining input.
            node_id = base_id(node)
            l = lens[node_id]
            beg = begs[node_id]
            prefix = tail[beg:beg + l]
            if kin.startswith(prefix):
                fn(start, offset + l + 1, node_id)

        while 1:
            code = kin.read()
            offset += 1
            terminal_idx = node + chck_TERMINATE_CODE
            if chck[terminal_idx] == chck_TERMINATE_CODE:
                # A key terminates at the current node.
                fn(start, offset, base_id(base[terminal_idx]))
                if code == chck_TERMINATE_CODE:
                    return
            idx = node + code
            node = base[idx]
            if chck[idx] == code:
                if node >= 0:
                    continue
                else:
                    call_if_key_including(kin, node, start, offset, fn)
            return
class WordDic:
    """Word dictionary: maps surface strings (via a trie) to morpheme
    entries with context ids, costs and feature data."""

    __slots__ = ['splitted', 'trie', 'data', 'wd_rd', 'wa_rd', 'indices',
                 'wi_rd', 'offsets', 'left_ids', 'right_ids', 'costs']

    def __init__(self, path, bigendian=False, splitted=False, use_mmap=None):
        """Load the word dictionary files stored under *path*."""
        self.splitted = splitted
        self.trie = Searcher(path + "/word2id", bigendian, use_mmap)
        if splitted:
            # Word data is split across several files; concatenate them.
            parts = sorted(glob.glob(path + "/word.dat.*"))
            self.data = util.get_chararray_multi(parts, bigendian)
        else:
            self.wd_rd = DictReader(path + "/word.dat", bigendian, use_mmap)
            with self.wd_rd as reader:
                self.data = reader.get_chararray()
        self.wa_rd = DictReader(path + "/word.ary.idx", bigendian, use_mmap)
        with self.wa_rd as reader:
            self.indices = reader.get_intarray()
        self.wi_rd = DictReader(path + "/word.inf", bigendian, use_mmap)
        with self.wi_rd as reader:
            # One record per word: an int offset plus three shorts.
            word_count = reader.size() // (4 + 2 + 2 + 2)
            self.offsets = reader.get_intarray(word_count)      # start of each word's feature data
            self.left_ids = reader.get_shortarray(word_count)   # left context id per word id
            self.right_ids = reader.get_shortarray(word_count)  # right context id per word id
            self.costs = reader.get_shortarray(word_count)      # cost per word id

    def release(self):
        """Drop every loaded array and release all underlying readers."""
        del self.data
        del self.indices
        del self.offsets
        del self.left_ids
        del self.right_ids
        del self.costs
        self.trie.release()
        del self.trie
        if not self.splitted:
            # wd_rd only exists when word data came from a single file.
            self.wd_rd.release()
            del self.wd_rd
        self.wa_rd.release()
        del self.wa_rd
        self.wi_rd.release()
        del self.wi_rd

    def search(self, text, start, callback):
        """Invoke *callback* with a ViterbiNode for every dictionary word
        that is a prefix of text[start:]."""
        costs = self.costs
        left_ids = self.left_ids
        right_ids = self.right_ids
        indices = self.indices

        def emit(start, offset, trieId):
            # Each trie id maps to a contiguous run of word ids.
            stop = indices[trieId + 1]
            for i in range(indices[trieId], stop):
                callback(ViterbiNode(i, start, offset, costs[i],
                                     left_ids[i], right_ids[i], False))

        self.trie.commonprefix_search(text, start, emit)

    def search_from_trie(self, trie_id, start, length, isspace, callback):
        """Emit a ViterbiNode for every word stored under *trie_id*."""
        indices = self.indices
        costs = self.costs
        left_ids = self.left_ids
        right_ids = self.right_ids
        for i in range(indices[trie_id], indices[trie_id + 1]):
            callback(ViterbiNode(i, start, length, costs[i],
                                 left_ids[i], right_ids[i], isspace))

    def word_data(self, word_id):
        """Return the feature data of *word_id* as bytes."""
        begin = self.offsets[word_id]
        end = self.offsets[word_id + 1]
        return tobytes(self.data[begin:end])
def __init__(self, path, bigendian=False, use_mmap=None):
    """Load the connection-cost matrix from "<path>/matrix.bin".

    The file starts with the left and right context-id counts, followed
    by left_size * right_size short costs.
    """
    self.rd = DictReader(path + "/matrix.bin", bigendian, use_mmap)
    with self.rd as reader:
        self.left_size = reader.get_int()
        num_right = reader.get_int()
        self.matrix = reader.get_shortarray(self.left_size * num_right)