def main(argv):
    """Parse each wiki-text file named on the command line and dump its tree.

    Each argument is a path ('-' reads stdin via getfp).  The WikiTree
    produced by WikiTextParser is printed as an indented, parenthesized
    dump, one level of indentation per tree depth.
    """
    from utils import getfp

    def dump(node, depth=0):
        # Recursive pretty-printer for the parse tree.  Leaves are
        # WikiToken / XMLTagToken / plain strings; interior nodes are
        # WikiTree containers that we descend into.
        if isinstance(node, WikiTree):
            print (' '*depth+'('+repr(node))
            for child in node:
                dump(child, depth+1)
            print (' '*depth+')')
        elif isinstance(node, (WikiToken, XMLTagToken, basestring)):
            print (' '*depth+repr(node))
        else:
            assert 0, node

    args = argv[1:] or ['-']
    codec = 'utf-8'
    for path in args:
        # Progress line to stderr.  The original 'print >>sys.stderr, path'
        # is Python-2-only syntax; an explicit write behaves identically
        # and keeps the module parseable by Python 3.
        sys.stderr.write('%s\n' % path)
        (_, fp) = getfp(path)
        parser = WikiTextParser()
        parser.feed_file(fp, codec=codec)
        parser.close()
        fp.close()
        dump(parser.get_root())
    return
def __init__(self, output=None, pathpat=None, codec="utf-8",
             titleline=False, mode="page"):
    """Initialize the writer.

    Exactly one destination must be supplied: either a fixed `output`
    stream/path (opened once, shared by all pages) or a `pathpat`
    template (a fresh file is opened per page later, by add_data).
    `codec` is the output text encoding, `titleline` prepends the page
    title to each page body, and `mode` selects the output format.
    """
    # A destination of some kind is mandatory.
    assert output is not None or pathpat is not None
    self.pathpat = pathpat
    self.codec = codec
    self.titleline = titleline
    self.mode = mode
    # Open the shared stream up front only when a fixed output was given;
    # otherwise _fp stays None until the first per-page file is opened.
    if output is None:
        self._fp = None
    else:
        (_, self._fp) = getfp(output, mode="w")
    # Per-page bookkeeping, filled in as pages/revisions are processed.
    self._pageid = None
    self._title = None
    self._revid = None
    return
def main(argv):
    """Print (pageid, title) for every revision in the given XML dump(s).

    Each argument is a path to a MediaWiki XML dump ('-' reads stdin via
    getfp).  Returns 0.
    """
    from utils import getfp

    class TitleExtractor(MWXMLDumpParser):
        # Override only the revision hook: emit the page id and title.
        def start_revision(self, pageid, title, revid, timestamp):
            print (pageid, title)
            return

    args = argv[1:] or ['-']
    for path in args:
        # Progress line to stderr.  'print >>sys.stderr, path' is
        # Python-2-only syntax; an explicit write behaves identically
        # and keeps the module parseable by Python 3.
        sys.stderr.write('%s\n' % path)
        (_, fp) = getfp(path)
        parser = TitleExtractor()
        parser.feed_file(fp)
        parser.close()
        fp.close()
    return 0
def add_data(self, pageid, revid, data):
    """Write one revision's text to the current output stream.

    Fix: this variant ignored self.mode even though __init__ accepts and
    stores a `mode` parameter; it now honors it the same way the other
    add_data implementations in this file do ('page' mode writes a
    form-feed-terminated page body, any other mode writes one
    tab-separated "title<TAB>text" record per page).
    """
    # Sanity-check that the page/revision bookkeeping set up earlier
    # matches what we are being asked to write.
    assert self._pageid == pageid
    assert self._title is not None
    assert self._revid == revid
    if self.pathpat is not None:
        # Per-page output: close the previous file and open a new one
        # named from the quoted-printable-encoded title and the page id.
        if self._fp is not None:
            self._fp.close()
        name = self._title.encode('utf-8').encode('quopri_codec')
        path = (self.pathpat % {'name': name, 'pageid': pageid})
        (_, self._fp) = getfp(path, 'w')
    assert self._fp is not None
    if self.mode == 'page':
        # Page mode: optional title line, then the text, terminated by
        # a blank line + form feed as a page separator.
        if self.titleline:
            title = self._title.encode(self.codec, 'ignore')
            self._fp.write(title+'\n')
        self._fp.write(data.encode(self.codec, 'ignore'))
        self._fp.write('\n\f')
    else:
        # Line mode: one tab-separated "title<TAB>text" record per page.
        self._fp.write(self._title.encode(self.codec, 'ignore')+'\t')
        self._fp.write(data.encode(self.codec, 'ignore')+'\n')
    return
def __init__(self, output=None, pathpat=None, codec='utf-8',
             titleline=False, mode='page'):
    """Set up the writer state.

    Requires either `output` (one shared destination, opened here) or
    `pathpat` (a per-page filename template; files are opened later).
    `codec` selects the output encoding, `titleline` controls whether
    the title is written before each page, and `mode` picks the format.
    """
    # Refuse to construct a writer with nowhere to write.
    assert output is not None or pathpat is not None
    self.pathpat = pathpat
    self.codec = codec
    self.titleline = titleline
    self.mode = mode
    # getfp returns a (name, fp) pair; only the fp is kept.
    self._fp = None if output is None else getfp(output, mode='w')[1]
    # Current-page tracking, populated during parsing.
    self._pageid = None
    self._title = None
    self._revid = None
    return
def add_data(self, pageid, revid, data):
    """Emit one revision's text through the active output stream.

    In "page" mode the text is written as a page body (optionally
    preceded by a title line) and terminated with a form feed; in any
    other mode a single tab-separated "title<TAB>text" line is written.
    """
    # The caller must have announced exactly this page/revision already.
    assert self._pageid == pageid
    assert self._title is not None
    assert self._revid == revid
    if self.pathpat is not None:
        # One file per page: finish the previous file, then open a new
        # one whose name embeds the quoted-printable-encoded title.
        if self._fp is not None:
            self._fp.close()
        quoted = self._title.encode("utf-8").encode("quopri_codec")
        target = self.pathpat % {"name": quoted, "pageid": pageid}
        (_, self._fp) = getfp(target, "w")
    assert self._fp is not None
    if self.mode != "page":
        # Compact record format: title and text on one tab-separated line.
        self._fp.write(self._title.encode(self.codec, "ignore") + "\t")
        self._fp.write(data.encode(self.codec, "ignore") + "\n")
    else:
        # Full page format, separated from the next page by "\n\f".
        if self.titleline:
            self._fp.write(self._title.encode(self.codec, "ignore") + "\n")
        self._fp.write(data.encode(self.codec, "ignore"))
        self._fp.write("\n\f")
    return
def add_data(self, pageid, revid, data):
    """Write the text of one revision to the output.

    'page' mode produces a form-feed-separated page body (with an
    optional leading title line); every other mode produces one
    tab-separated "title<TAB>text" record.
    """
    # Consistency checks against the bookkeeping set by earlier calls.
    assert self._pageid == pageid
    assert self._title is not None
    assert self._revid == revid
    if self.pathpat is not None:
        # Roll over to a fresh per-page file; its name is derived from
        # the quoted-printable form of the title plus the page id.
        if self._fp is not None:
            self._fp.close()
        fname = self._title.encode('utf-8').encode('quopri_codec')
        (_, self._fp) = getfp(self.pathpat % {'name': fname, 'pageid': pageid}, 'w')
    assert self._fp is not None
    out = self._fp.write
    if self.mode == 'page':
        if self.titleline:
            out(self._title.encode(self.codec, 'ignore') + '\n')
        out(data.encode(self.codec, 'ignore'))
        # Blank line + form feed acts as the page separator.
        out('\n\f')
    else:
        out(self._title.encode(self.codec, 'ignore') + '\t')
        out(data.encode(self.codec, 'ignore') + '\n')
    return
def main(argv):
    """Tokenize each input file, printing every text span and token.

    Each argument is a path ('-' reads stdin via getfp); for every text
    fragment and wiki token the tokenizer produces, its position and
    value are printed to stdout.
    """
    from utils import getfp

    class Tokenizer(WikiTextTokenizer):
        # Debug subclass: just echo whatever the tokenizer hands us.
        def handle_text(self, pos, text):
            print(pos, text)
            return

        def handle_token(self, pos, token):
            print(pos, token)
            return

    args = argv[1:] or ["-"]
    codec = "utf-8"
    for path in args:
        # Progress line to stderr.  'print >>sys.stderr, path' is
        # Python-2-only syntax (the rest of this function already uses
        # call-style print); an explicit write behaves identically and
        # keeps the module parseable by Python 3.
        sys.stderr.write("%s\n" % path)
        (_, fp) = getfp(path)
        tokenizer = Tokenizer()
        tokenizer.feed_file(fp, codec=codec)
        tokenizer.close()
        fp.close()
    return
def main(argv):
    """Run the tokenizer over each argument file and dump its output.

    Arguments are paths ('-' reads stdin via getfp).  Positions and
    values of all text fragments and tokens go to stdout; the current
    file name goes to stderr.
    """
    from utils import getfp

    class Tokenizer(WikiTextTokenizer):
        # Minimal debugging subclass: print each callback's arguments.
        def handle_text(self, pos, text):
            print(pos, text)
            return

        def handle_token(self, pos, token):
            print(pos, token)
            return

    args = argv[1:] or ['-']
    codec = 'utf-8'
    for path in args:
        # 'print >> sys.stderr, path' is Python-2-only syntax; write the
        # progress line explicitly so the behavior is identical on
        # Python 2 and the module also parses under Python 3.
        sys.stderr.write('%s\n' % path)
        (_, fp) = getfp(path)
        tokenizer = Tokenizer()
        tokenizer.feed_file(fp, codec=codec)
        tokenizer.close()
        fp.close()
    return