def assertCMSIdx(self, cms, fname, keys):
    """Assert that the decoded keys of an index file equal *keys*.

    The file <cms.basedir>/idx/<fname> is opened with CDBReader and its
    keys are visited in the order iterkeys() yields them.  Each key is
    dispatched on its first byte:

      * 0x00: the whole key is unpacked with '>xll' and the resulting
        pair of integers is collected as a tuple.
      * 0xfd or 0xfe as first byte, or a key that is exactly the single
        byte 0xff: ignored.
      * anything else: the first byte is stripped and the remainder is
        collected, decoded as follows:
          - 0x10..0x13: decoded from UTF-8 into a unicode string;
          - 0x20: every byte b becomes the character U+3000 + b;
          - 0xf0: a remainder of 2, 3 or 4 bytes is unpacked as
            big-endian integers and rendered as 'NNNN', 'NNNN/NN' or
            'NNNN/NN/NN' (NOTE(review): looks like year/month/day --
            confirm); any other length is kept as raw bytes;
          - other first bytes: kept as raw bytes.

    The collected list is then compared to *keys* with assertEqual.
    """
    idxpath = os.path.join(cms.basedir, 'idx', fname)
    # Remainder length -> (struct layout, output template) for 0xf0 keys.
    numeric_layouts = {
        2: ('>h', '%04d'),
        3: ('>hb', '%04d/%02d'),
        4: ('>hbb', '%04d/%02d/%02d'),
    }
    found = []
    for key in CDBReader(idxpath).iterkeys():
        tag = key[0]
        if tag == '\x00':
            # '>xll' skips the tag byte and yields exactly two integers.
            found.append(struct.unpack('>xll', key))
            continue
        if tag in ('\xfd', '\xfe') or key == '\xff':
            continue
        payload = key[1:]
        if '\x10' <= tag <= '\x13':
            word = unicode(payload, 'utf-8')
        elif tag == '\x20':
            word = u''.join([unichr(0x3000 + ord(ch)) for ch in payload])
        elif tag == '\xf0' and len(payload) in numeric_layouts:
            (layout, template) = numeric_layouts[len(payload)]
            word = template % struct.unpack(layout, payload)
        else:
            # Unrecognised tag (or 0xf0 with an unexpected length):
            # keep the raw remainder.
            word = payload
        found.append(word)
    self.assertEqual(found, keys)
def __init__(self, path, ext='', codec='utf-8'):
    """Open a CDBReader on *path* and remember the given settings.

    path:  passed unchanged to CDBReader; the reader is stored as
           self._reader.
    ext:   stored as self.ext (defaults to the empty string).
    codec: stored as self.codec (defaults to 'utf-8').
    """
    # The reader is opened first, so a failure leaves ext/codec unset,
    # exactly as before.
    self._reader = CDBReader(path)
    (self.ext, self.codec) = (ext, codec)