def assertCMSIdx(self, cms, fname, keys):
    r"""Assert that the index file ``fname`` contains exactly ``keys``.

    The file ``<cms.basedir>/idx/<fname>`` is opened with ``CDBReader`` and
    every key is decoded according to its first byte.  The decoded items,
    in iteration order, must equal ``keys``.

    Decoding rules (first byte of the raw key):
      * ``\x00``            -- whole key unpacked with ``'>xll'`` into a
                               ``(docid, sentid)`` tuple.
      * ``\xfd`` / ``\xfe`` -- skipped.
      * key equal to ``\xff`` -- skipped.
      * ``\x10``..``\x13``  -- remainder decoded from UTF-8.
      * ``\x20``            -- each remaining byte ``b`` becomes the code
                               point ``0x3000 + b``.
      * ``\xf0``            -- remainder of length 2/3/4 is unpacked as
                               big-endian ``h`` / ``hb`` / ``hbb`` and
                               formatted ``YYYY[/MM[/DD]]``-style
                               (presumably dates -- TODO confirm).
      * anything else       -- remainder kept as the raw string.
    """
    # Payload length -> (output template, struct layout) for '\xf0' keys.
    numeric_layouts = {
        2: ('%04d', '>h'),
        3: ('%04d/%02d', '>hb'),
        4: ('%04d/%02d/%02d', '>hbb'),
    }

    idx_path = os.path.join(cms.basedir, 'idx', fname)
    decoded = []
    for raw in CDBReader(idx_path).iterkeys():
        tag = raw[0]

        if tag == '\x00':
            # '>xll' always yields exactly two values: (docid, sentid).
            decoded.append(struct.unpack('>xll', raw))
            continue

        # Note: '\xff' is matched against the whole key, not just the tag.
        if tag in ('\xfd', '\xfe') or raw == '\xff':
            continue

        payload = raw[1:]
        item = payload
        if '\x10' <= tag <= '\x13':
            item = unicode(payload, 'utf-8')
        elif tag == '\x20':
            item = u''.join(unichr(0x3000 + ord(ch)) for ch in payload)
        elif tag == '\xf0':
            layout = numeric_layouts.get(len(payload))
            if layout is not None:
                template, fmt = layout
                item = template % struct.unpack(fmt, payload)
        decoded.append(item)

    self.assertEqual(decoded, keys)
def assertCMSIdx(self, cms, fname, keys):
    r"""Compare the decoded keys of an index file against ``keys``.

    Reads every key of ``<cms.basedir>/idx/<fname>`` through ``CDBReader``,
    turns each one into a comparable value and finally calls
    ``self.assertEqual`` on the collected list and ``keys``.

    Keys whose first byte is ``\x00`` are unpacked with ``'>xll'`` into a
    ``(docid, sentid)`` pair.  Keys starting with ``\xfd`` or ``\xfe``, and
    the key that is exactly ``\xff``, are ignored.  Every other key is
    split into a one-byte prefix and a body, and the body is decoded by
    the nested ``decode`` helper.
    """

    def decode(prefix, body):
        # Translate the body of a key according to its one-byte prefix.
        if '\x10' <= prefix <= '\x13':
            return unicode(body, 'utf-8')
        if prefix == '\x20':
            # Each byte is an offset from U+3000.
            return u''.join(unichr(0x3000 + ord(b)) for b in body)
        if prefix == '\xf0':
            size = len(body)
            if size == 2:
                return '%04d' % struct.unpack('>h', body)
            if size == 3:
                return '%04d/%02d' % struct.unpack('>hb', body)
            if size == 4:
                return '%04d/%02d/%02d' % struct.unpack('>hbb', body)
        # Unknown prefix (or unexpected '\xf0' length): keep the raw body.
        return body

    reader = CDBReader(os.path.join(cms.basedir, 'idx', fname))
    found = []
    for key in reader.iterkeys():
        head = key[0]
        if head == '\x00':
            docid, sentid = struct.unpack('>xll', key)
            found.append((docid, sentid))
        elif head == '\xfd' or head == '\xfe' or key == '\xff':
            # Entries that are not part of the comparison.
            continue
        else:
            found.append(decode(head, key[1:]))

    self.assertEqual(found, keys)
class WikiDBReader(object):
    """Read-only access to wiki pages stored in a CDB file.

    Keys looked up by this class:
      * ``<pageid>:title``              -- the page title
      * ``<pageid>:revs``               -- revision ids separated by spaces
      * ``<pageid>/<revid>:wiki<ext>``  -- payload passed to ``decompress``
      * ``<pageid>/<revid>:text<ext>``  -- payload passed to ``decompress``

    Iterating over the reader yields page ids; indexing it with a page id
    returns ``(title, revids)``.
    """

    def __init__(self, path, ext='', codec='utf-8'):
        """Open ``path`` with ``CDBReader`` and remember ``ext`` and ``codec``."""
        self._reader = CDBReader(path)
        self.ext = ext
        self.codec = codec

    def __iter__(self):
        return self.get_pageids()

    def __getitem__(self, pageid):
        return self.get_page(pageid)

    def _get_data(self, key):
        """Return the value stored under ``key``, decompressed and decoded.

        Undecodable bytes are dropped (``'ignore'`` error handler).
        """
        return decompress(key, self._reader[key]).decode(self.codec, 'ignore')

    def get_pageids(self):
        """Yield the integer id of every page that has a ``:title`` key."""
        for key in self._reader.iterkeys():
            if not key.endswith(':title'):
                continue
            # The page id is everything before the first colon.
            yield int(key.partition(':')[0])

    def get_page(self, pageid):
        """Return ``(title, revids)`` for ``pageid``.

        ``title`` is decoded with the configured codec; ``revids`` is the
        stored revision list split on single spaces.
        """
        title = self._reader['%s:title' % pageid].decode(self.codec, 'ignore')
        revids = self._reader['%s:revs' % pageid].split(' ')
        return (title, revids)

    def get_wiki(self, pageid, revid):
        """Return the decoded ``:wiki`` entry of the given page revision."""
        return self._get_data(('%s/%s:wiki' % (pageid, revid)) + self.ext)

    def get_text(self, pageid, revid):
        """Return the decoded ``:text`` entry of the given page revision."""
        return self._get_data(('%s/%s:text' % (pageid, revid)) + self.ext)
class WikiDBReader(object):
    """Reader for a CDB file holding wiki page titles, revisions and bodies.

    ``for pageid in reader`` walks the page ids found in the file and
    ``reader[pageid]`` is shorthand for ``reader.get_page(pageid)``.
    """

    def __init__(self, path, ext="", codec="utf-8"):
        """Create a reader over the CDB file at ``path``.

        ``ext`` is appended to the keys built by ``get_wiki`` and
        ``get_text``; ``codec`` is used to decode everything that is
        returned as text.
        """
        self._reader = CDBReader(path)
        self.ext = ext
        self.codec = codec

    def __iter__(self):
        return self.get_pageids()

    def __getitem__(self, pageid):
        return self.get_page(pageid)

    def _get_data(self, key):
        """Look up ``key``, run the value through ``decompress``, decode it."""
        stored = self._reader[key]
        expanded = decompress(key, stored)
        return expanded.decode(self.codec, "ignore")

    def get_pageids(self):
        """Generate page ids (as ``int``) from the keys ending in ``:title``."""
        for entry in self._reader.iterkeys():
            if entry.endswith(":title"):
                prefix = entry.partition(":")[0]
                yield int(prefix)

    def get_page(self, pageid):
        """Return a ``(title, revids)`` tuple for the page ``pageid``.

        The title comes from ``<pageid>:title`` (decoded, errors ignored);
        the revision ids come from ``<pageid>:revs`` split on ``" "``.
        """
        raw_title = self._reader["%s:title" % pageid]
        raw_revs = self._reader["%s:revs" % pageid]
        return (raw_title.decode(self.codec, "ignore"), raw_revs.split(" "))

    def get_wiki(self, pageid, revid):
        """Fetch the ``<pageid>/<revid>:wiki`` entry (plus ``ext``) as text."""
        base = "%s/%s:wiki" % (pageid, revid)
        return self._get_data(base + self.ext)

    def get_text(self, pageid, revid):
        """Fetch the ``<pageid>/<revid>:text`` entry (plus ``ext``) as text."""
        base = "%s/%s:text" % (pageid, revid)
        return self._get_data(base + self.ext)
class WikiDBReader(object):
    """Look up wiki pages and their revisions in a CDB file.

    The reader is iterable (yielding page ids) and subscriptable by page
    id (returning the same tuple as ``get_page``).
    """

    def __init__(self, path, ext='', codec='utf-8'):
        """Wrap ``CDBReader(path)``; keep ``ext`` and ``codec`` for lookups."""
        self._reader = CDBReader(path)
        self.ext = ext
        self.codec = codec

    def __iter__(self):
        return self.get_pageids()

    def __getitem__(self, pageid):
        return self.get_page(pageid)

    def _get_data(self, key):
        """Return the text stored under ``key``.

        The raw value is handed to ``decompress`` together with its key and
        the result is decoded with ``self.codec``, ignoring decode errors.
        """
        payload = decompress(key, self._reader[key])
        return payload.decode(self.codec, 'ignore')

    def get_pageids(self):
        """Yield ``int(pageid)`` for each ``<pageid>:title`` key in the file."""
        for name in self._reader.iterkeys():
            if not name.endswith(':title'):
                continue
            head, _, _ = name.partition(':')
            yield int(head)

    def get_page(self, pageid):
        """Return ``(title, revids)`` for ``pageid``.

        ``revids`` is a list of strings obtained by splitting the
        ``<pageid>:revs`` value on single spaces.
        """
        title_key = '%s:title' % pageid
        revs_key = '%s:revs' % pageid
        title = self._reader[title_key].decode(self.codec, 'ignore')
        return (title, self._reader[revs_key].split(' '))

    def get_wiki(self, pageid, revid):
        """Return the ``:wiki`` content of revision ``revid`` of ``pageid``."""
        key = ('%s/%s:wiki' % (pageid, revid)) + self.ext
        return self._get_data(key)

    def get_text(self, pageid, revid):
        """Return the ``:text`` content of revision ``revid`` of ``pageid``."""
        key = ('%s/%s:text' % (pageid, revid)) + self.ext
        return self._get_data(key)
def __init__(self, path, ext="", codec="utf-8"):
    """Open ``path`` with ``CDBReader`` and store the lookup settings.

    ``ext`` and ``codec`` are kept unchanged on the instance as
    ``self.ext`` and ``self.codec``.
    """
    self._reader = CDBReader(path)
    self.ext, self.codec = ext, codec
def __init__(self, path, ext='', codec='utf-8'):
    """Set up the instance: a ``CDBReader`` on ``path`` plus two settings.

    The reader is stored as ``self._reader``; ``ext`` and ``codec`` are
    stored as given under the attributes of the same name.
    """
    reader = CDBReader(path)
    self._reader = reader
    self.ext = ext
    self.codec = codec