class MWXMLDump2CDB(MWXMLDumpFilter):
    """Dump filter that stores pages and revisions in a CDB file.

    Records are handed to a CDBMaker under these keys:

      '<pageid>:title'        -> page title, UTF-8 encoded
      '<pageid>:revs'         -> revision ids of the page, space separated
      '<pageid>/<revid>:text' -> revision text, UTF-8 encoded then gzipped

    The start_*/end_*/open_file/close_file/write_file methods override
    MWXMLDumpFilter hooks; presumably the base class calls them while it
    walks the dump -- confirm against MWXMLDumpFilter.
    """

    def __init__(self, path):
        """Create the filter and a CDBMaker for *path*."""
        MWXMLDumpFilter.__init__(self)
        self._maker = CDBMaker(path)
        # Key and in-memory buffer of the revision text currently being
        # written; both are None while no text record is open.
        self._key = self._value = None
        # Revision ids collected for the current page.  Initialised here
        # (not only in start_page) so that start_revision()/end_page()
        # never hit a missing attribute.
        self._revs = []
        return

    def close(self):
        """Close the base filter, then finish the CDBMaker."""
        MWXMLDumpFilter.close(self)
        self._maker.finish()
        return

    def start_page(self, pageid, title):
        """Store the page title and reset the per-page revision list.

        *title* is encoded with UTF-8 before being stored (assumes a
        unicode string).
        """
        MWXMLDumpFilter.start_page(self, pageid, title)
        self._maker.add('%s:title' % pageid, title.encode('utf-8'))
        self._revs = []
        return

    def start_revision(self, pageid, title, revid, timestamp):
        """Remember *revid* as belonging to the current page."""
        MWXMLDumpFilter.start_revision(self, pageid, title, revid, timestamp)
        self._revs.append(revid)
        return

    def end_page(self, pageid, title):
        """Store the space-separated revision ids collected for the page."""
        MWXMLDumpFilter.end_page(self, pageid, title)
        revs = ' '.join(str(revid) for revid in self._revs)
        self._maker.add('%s:revs' % pageid, revs)
        return

    def open_file(self, pageid, title, revid, timestamp):
        """Start a text record and return a writable gzip stream for it.

        The stream compresses into an in-memory buffer; nothing reaches
        the CDBMaker until close_file() is called.  Only one record can
        be open at a time, because the key and buffer are kept in single
        instance attributes.
        """
        # Progress output on stderr.
        print >>sys.stderr, (pageid, title, revid)
        self._key = '%s/%s:text' % (pageid, revid)
        self._value = StringIO()
        return GzipFile(mode='w', fileobj=self._value)

    def close_file(self, fp):
        """Close *fp* and store the compressed buffer under the open key."""
        # The gzip stream must be closed before the buffer is read so
        # that the compressed data is complete.
        fp.close()
        self._maker.add(self._key, self._value.getvalue())
        self._key = self._value = None
        return

    def write_file(self, fp, text):
        """Write *text* to *fp* encoded as UTF-8."""
        fp.write(text.encode('utf-8'))
        return
class MWCDB2Text(object):
    """Convert gzipped text records of one CDB into another CDB.

    Every converted record is read from the source database under the
    key '<pageid>/<revision>', run through a parser obtained from
    *factory*, and written gzipped to the destination under the same
    key.  The page's '<pageid>:title' record is copied unchanged.

    NOTE(review): the text keys used here carry no ':text' suffix,
    whereas MWXMLDump2CDB.open_file stores revision text under
    '<pageid>/<revid>:text' -- confirm which layout the source database
    is expected to have.
    """

    def __init__(self, srcpath, dstpath, factory):
        """Open a CDBReader on *srcpath* and a CDBMaker on *dstpath*.

        *factory* is called with the codec name 'utf-8' once per
        converted record; the object it returns must provide
        feed_text(), close() and convert().
        """
        self.reader = CDBReader(srcpath)
        self.writer = CDBMaker(dstpath)
        self.factory = factory

    def close(self):
        """Finish the destination database."""
        self.writer.finish()

    def convert(self, pageid, revision=0):
        """Convert one revision and copy the title of its page.

        *pageid* and *revision* are formatted with '%d', so they must be
        integers.
        """
        text_key = '%d/%d' % (pageid, revision)
        gz_in = GzipFile(mode='r', fileobj=StringIO(self.reader[text_key]))
        out_buf = StringIO()
        gz_out = GzipFile(mode='w', fileobj=out_buf)

        parser = self.factory('utf-8')
        wikitext = gz_in.read().decode('utf-8')
        parser.feed_text(wikitext)
        parser.close()
        parser.convert(gz_out)

        gz_in.close()
        # Close the gzip writer before reading the buffer so that the
        # compressed data is complete.
        gz_out.close()
        self.writer.add(text_key, out_buf.getvalue())

        title_key = '%d:title' % pageid
        self.writer.add(title_key, self.reader[title_key])

    def convert_all(self):
        """Convert every record whose key has the form '<int>/<int>'.

        Keys without a '/' or with non-integer parts are skipped.
        """
        for key in self.reader:
            # Split on the last '/'.  With no '/' the head is empty and
            # int() rejects it, so such keys are skipped as well.
            head, _, tail = key.rpartition('/')
            try:
                pageid = int(head)
                revision = int(tail)
            except ValueError:
                continue
            # Progress output on stderr.
            print >>sys.stderr, (pageid, revision)
            self.convert(pageid, revision)
def __init__(self, srcpath, dstpath, factory):
    """Open a CDBReader on *srcpath* and a CDBMaker on *dstpath*.

    *factory* is only stored here.  MWCDB2Text.convert later calls it
    with the codec name 'utf-8' to obtain a parser.
    """
    # The reader is created before the maker, as in the original order.
    self.reader = CDBReader(srcpath)
    self.writer = CDBMaker(dstpath)
    self.factory = factory
def __init__(self, path):
    """Create the filter and a CDBMaker for *path*."""
    MWXMLDumpFilter.__init__(self)
    self._maker = CDBMaker(path)
    # Key and in-memory buffer of the text record currently being
    # written; both are None while no record is open.
    self._key = self._value = None
    # Revision ids of the current page.  MWXMLDump2CDB.start_revision
    # appends to this list and end_page reads it, so create it here
    # rather than relying on start_page having run first.
    self._revs = []
    return