def main(argv):
    """Command-line entry point: convert wiki inputs and write the results.

    Options (parsed with getopt from ``argv[1:]``):
      -L          use WikiLinkExtractor instead of WikiTextExtractor
      -d          pass sys.stderr to the Converter as ``errfp``
      -o output   output name; a name ending in '.cdb' selects WikiDBWriter,
                  anything else selects WikiFileWriter (default '-')
      -P pathpat  ``pathpat`` handed to WikiFileWriter
      -c codec    codec handed to the readers/writers (default 'utf-8')
      -T          set ``titleline`` for WikiFileWriter
      -Z          use the '.gz' extension for CDB readers/writers

    Positional arguments are input paths; '-' is used when none are given.
    Inputs ending in '.cdb' are read through WikiDBReader; other inputs are
    opened with getfp and treated as an XML dump when the name returned by
    getfp ends in '.xml', otherwise as a single page.

    Returns 100 after printing the usage message on an option error,
    otherwise None.
    """
    import getopt

    def usage():
        # The format operator must be applied to the string, not to the
        # result of print(): the old form raised TypeError on Python 3.
        print('usage: %s [-L] [-d] [-o output] [-P pathpat] [-c codec] [-T] [-Z] '
              '[file ...]' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'Ldo:P:c:TZ')
    except getopt.GetoptError:
        return usage()
    args = args or ['-']
    errfp = None
    output = '-'
    codec = 'utf-8'
    ext = ''
    pathpat = None
    titleline = False
    klass = WikiTextExtractor
    for (k, v) in opts:
        if k == '-d':
            errfp = sys.stderr
        elif k == '-o':
            output = v
        elif k == '-P':
            pathpat = v
        elif k == '-c':
            codec = v
        elif k == '-T':
            titleline = True
        elif k == '-Z':
            ext = '.gz'
        elif k == '-L':
            klass = WikiLinkExtractor
    if output.endswith('.cdb'):
        writer = WikiDBWriter(output, codec=codec, ext=ext)
    else:
        writer = WikiFileWriter(
            output=output, pathpat=pathpat,
            codec=codec, titleline=titleline)
    try:
        converter = Converter(writer, klass, errfp=errfp)
        for path in args:
            if path.endswith('.cdb'):
                reader = WikiDBReader(path, codec=codec, ext=ext)
                for pageid in reader:
                    (title, revids) = reader[pageid]
                    converter.add_page(pageid, title)
                    for revid in revids:
                        wiki = reader.get_wiki(pageid, revid)
                        converter.add_revid(pageid, revid)
                        converter.feed_text(pageid, revid, wiki)
            else:
                # getfp returns a (possibly adjusted) name plus the handle;
                # the returned name decides how the input is interpreted.
                (path, fp) = getfp(path)
                try:
                    if path.endswith('.xml'):
                        parser = MWDump2Text(converter)
                        parser.feed_file(fp)
                        parser.close()
                    else:
                        converter.add_page(0, path)
                        converter.add_revid(0, 0)
                        converter.feed_file(0, 0, fp, codec=codec)
                finally:
                    # Release the input even when conversion fails.
                    fp.close()
        converter.close()
    finally:
        writer.close()
    return
def main(argv):
    """Feed each input path to a fresh WikiAgeExtractor.

    ``argv[1:]`` lists the inputs; '-' is used when none are given.  For
    every input the path is echoed to stderr, the file is opened with
    getfp, and its contents are fed to a WikiAgeExtractor constructed with
    the current time (``time.time()``).

    Returns None.
    """
    args = argv[1:] or ['-']
    for path in args:
        # sys.stderr.write works on both Python 2 and 3; the former
        # "print >>sys.stderr" chevron form raises TypeError on Python 3.
        sys.stderr.write('%s\n' % (path,))
        (_, fp) = getfp(path)
        try:
            parser = WikiAgeExtractor(time.time())
            parser.feed_file(fp)
            parser.close()
        finally:
            # Release the input even if parsing fails part-way.
            fp.close()
    return
def main(argv):
    """Run WikiAgeExtractor over every input named on the command line.

    Inputs come from ``argv[1:]``, defaulting to ['-'] when empty.  Each
    path is written to stderr before it is opened through getfp and fed to
    a new WikiAgeExtractor created with ``time.time()``.

    Returns None.
    """
    args = argv[1:] or ['-']
    for path in args:
        # Portable replacement for the Python 2-only "print >> sys.stderr"
        # statement, which fails with TypeError when run on Python 3.
        sys.stderr.write('%s\n' % (path,))
        (_, fp) = getfp(path)
        try:
            parser = WikiAgeExtractor(time.time())
            parser.feed_file(fp)
            parser.close()
        finally:
            # Always close the input handle, including on parser errors.
            fp.close()
    return
def main(argv):
    """Command-line entry point: dump pages stored in CDB files.

    Options (parsed with getopt from ``argv[1:]``):
      -w          dump via ``reader.get_wiki`` instead of ``reader.get_text``
      -c codec    codec used to encode output and passed to WikiDBReader
                  (default "utf-8")
      -o output   output name passed to getfp in "w" mode (default "-")
      -T          write each page title on its own line before its revisions
      -Z          use the ".gz" extension for WikiDBReader

    Positional arguments that name existing files are opened as CDB
    readers; every other argument is treated as a page id.  When no page
    ids are given, each reader is iterated in full.  Page ids or revisions
    that raise KeyError are skipped.

    Returns 100 after printing usage (bad option or no arguments),
    otherwise None.
    """
    import getopt

    def usage():
        print("usage: %s {-w} [-c codec] [-o output] [-T] [-Z] "
              "cdbfile [pageid ...]" % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "wo:c:TZ")
    except getopt.GetoptError:
        return usage()
    text = True
    output = "-"
    codec = "utf-8"
    ext = ""
    titleline = False
    for (k, v) in opts:
        if k == "-o":
            output = v
        elif k == "-c":
            codec = v
        elif k == "-w":
            text = False
        elif k == "-T":
            titleline = True
        elif k == "-Z":
            ext = ".gz"
    if not args:
        return usage()
    (_, outfp) = getfp(output, "w")
    try:
        readers = []
        pageids = []
        for arg in args:
            if os.path.isfile(arg):
                readers.append(WikiDBReader(arg, codec=codec, ext=ext))
            else:
                pageids.append(arg)
        for reader in readers:
            # Explicit page ids take precedence; otherwise walk the reader.
            for pageid in pageids or iter(reader):
                try:
                    (title, revids) = reader[pageid]
                except KeyError:
                    continue
                if titleline:
                    outfp.write(title.encode(codec, "ignore") + "\n")
                for revid in revids:
                    try:
                        if text:
                            data = reader.get_text(pageid, revid)
                        else:
                            data = reader.get_wiki(pageid, revid)
                    except KeyError:
                        continue
                    outfp.write(data.encode(codec, "ignore"))
    finally:
        # Previously the output handle was never closed, so flushing of
        # buffered data was left to garbage collection.
        outfp.close()
    return
def main(argv):
    """Command-line entry point: write out pages held in CDB files.

    Options (parsed with getopt from ``argv[1:]``):
      -w          fetch revisions with ``reader.get_wiki`` rather than
                  ``reader.get_text``
      -c codec    codec for encoding output, also given to WikiDBReader
                  (default 'utf-8')
      -o output   output name opened through getfp in 'w' mode (default '-')
      -T          emit the page title plus a newline before its revisions
      -Z          use the '.gz' extension for WikiDBReader

    Arguments naming existing files become CDB readers; all remaining
    arguments are collected as page ids.  With no page ids, every page of
    each reader is dumped.  KeyError on a page or revision lookup skips
    that item.

    Returns 100 after printing usage (bad option or no arguments),
    otherwise None.
    """
    import getopt

    def usage():
        print('usage: %s {-w} [-c codec] [-o output] [-T] [-Z] '
              'cdbfile [pageid ...]' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'wo:c:TZ')
    except getopt.GetoptError:
        return usage()
    text = True
    output = '-'
    codec = 'utf-8'
    ext = ''
    titleline = False
    for (k, v) in opts:
        if k == '-o':
            output = v
        elif k == '-c':
            codec = v
        elif k == '-w':
            text = False
        elif k == '-T':
            titleline = True
        elif k == '-Z':
            ext = '.gz'
    if not args:
        return usage()
    (_, outfp) = getfp(output, 'w')
    try:
        readers = []
        pageids = []
        for arg in args:
            if os.path.isfile(arg):
                readers.append(WikiDBReader(arg, codec=codec, ext=ext))
            else:
                pageids.append(arg)
        for reader in readers:
            # Fall back to iterating the whole reader when no ids were given.
            for pageid in (pageids or iter(reader)):
                try:
                    (title, revids) = reader[pageid]
                except KeyError:
                    continue
                if titleline:
                    outfp.write(title.encode(codec, 'ignore') + '\n')
                for revid in revids:
                    try:
                        if text:
                            data = reader.get_text(pageid, revid)
                        else:
                            data = reader.get_wiki(pageid, revid)
                    except KeyError:
                        continue
                    outfp.write(data.encode(codec, 'ignore'))
    finally:
        # Close the output explicitly so buffered data is flushed even when
        # a read fails; the handle was previously left open.
        outfp.close()
    return
def main(argv):
    """Command-line entry point: feed XML dump files into a writer.

    Options (parsed with getopt from ``argv[1:]``):
      -o output   output name; names ending in '.cdb' use WikiDBWriter,
                  all others use WikiFileWriter (default '-')
      -P pathpat  ``pathpat`` for WikiFileWriter
      -c codec    codec given to the writer (default 'utf-8')
      -T          enable ``titleline`` on WikiFileWriter
      -Z          use the '.gz' extension for WikiDBWriter

    Remaining arguments are input paths ('-' when none are given).  Each is
    opened with getfp and fed to a single MWXMLDump2DB parser, which is
    closed after the last input.

    Returns 100 after printing usage on an option error, otherwise None.
    """
    import getopt

    def usage():
        message = ('usage: %s [-o output] [-P pathpat] [-c codec] [-T] [-Z] '
                   '[file ...]' % argv[0])
        print(message)
        return 100

    try:
        (options, paths) = getopt.getopt(argv[1:], 'o:P:c:TZ')
    except getopt.GetoptError:
        return usage()
    if not paths:
        paths = ['-']

    # Options carrying a value, keyed by flag and preset to their defaults.
    valued = {'-o': '-', '-P': None, '-c': 'utf-8'}
    # Plain switches seen on the command line (getopt only yields -T / -Z).
    switched = set()
    for (flag, value) in options:
        if flag in valued:
            valued[flag] = value
        else:
            switched.add(flag)

    output = valued['-o']
    pathpat = valued['-P']
    codec = valued['-c']
    titleline = '-T' in switched
    ext = '.gz' if '-Z' in switched else ''

    if not output.endswith('.cdb'):
        writer = WikiFileWriter(
            output=output, pathpat=pathpat,
            codec=codec, titleline=titleline)
    else:
        writer = WikiDBWriter(output, codec=codec, ext=ext)

    parser = MWXMLDump2DB(writer)
    for source in paths:
        (_name, stream) = getfp(source)
        try:
            parser.feed_file(stream)
        finally:
            stream.close()
    parser.close()
    return
def main(argv):
    """Command-line entry point: convert wiki inputs and write the results.

    Options (parsed with getopt from ``argv[1:]``):
      -L          use WikiLinkExtractor instead of WikiTextExtractor
      -C          use WikiCategoryExtractor instead of WikiTextExtractor
      -d          pass sys.stderr to the Converter as ``errfp``
      -o output   output name; a name ending in '.cdb' selects WikiDBWriter,
                  anything else selects WikiFileWriter (default '-')
      -P pathpat  ``pathpat`` handed to WikiFileWriter
      -c codec    codec handed to the readers/writers (default 'utf-8')
      -m mode     ``mode`` handed to WikiFileWriter (default 'page')
      -T          set ``titleline`` for WikiFileWriter
      -Z          use the '.gz' extension for CDB readers/writers

    Positional arguments are input paths; '-' is used when none are given.
    Inputs ending in '.cdb' are read through WikiDBReader; other inputs are
    opened with getfp and treated as an XML dump when the name returned by
    getfp ends in '.xml', otherwise as a single page.

    Returns 100 after printing the usage message on an option error,
    otherwise None.
    """
    import getopt

    def usage():
        # Format the string before printing: applying "%" to the result of
        # print() raised TypeError on Python 3.  The accepted -m option is
        # now listed as well.
        print('usage: %s [-L|-C] [-d] [-o output] [-P pathpat] [-c codec] '
              '[-m mode] [-T] [-Z] [file ...]' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'LCdo:P:c:m:TZ')
    except getopt.GetoptError:
        return usage()
    args = args or ['-']
    errfp = None
    output = '-'
    codec = 'utf-8'
    ext = ''
    pathpat = None
    mode = 'page'
    titleline = False
    klass = WikiTextExtractor
    for (k, v) in opts:
        if k == '-d':
            errfp = sys.stderr
        elif k == '-o':
            output = v
        elif k == '-P':
            pathpat = v
        elif k == '-c':
            codec = v
        elif k == '-m':
            mode = v
        elif k == '-T':
            titleline = True
        elif k == '-Z':
            ext = '.gz'
        elif k == '-L':
            klass = WikiLinkExtractor
        elif k == '-C':
            klass = WikiCategoryExtractor
    if output.endswith('.cdb'):
        writer = WikiDBWriter(output, codec=codec, ext=ext)
    else:
        writer = WikiFileWriter(output=output, pathpat=pathpat,
                                codec=codec, titleline=titleline,
                                mode=mode)
    try:
        converter = Converter(writer, klass, errfp=errfp)
        for path in args:
            if path.endswith('.cdb'):
                reader = WikiDBReader(path, codec=codec, ext=ext)
                for pageid in reader:
                    (title, revids) = reader[pageid]
                    converter.add_page(pageid, title)
                    for revid in revids:
                        wiki = reader.get_wiki(pageid, revid)
                        converter.add_revid(pageid, revid)
                        converter.feed_text(pageid, revid, wiki)
            else:
                # getfp returns a (possibly adjusted) name plus the handle;
                # the returned name decides how the input is interpreted.
                (path, fp) = getfp(path)
                try:
                    if path.endswith('.xml'):
                        parser = MWDump2Text(converter)
                        parser.feed_file(fp)
                        parser.close()
                    else:
                        converter.add_page(0, path)
                        converter.add_revid(0, 0)
                        converter.feed_file(0, 0, fp, codec=codec)
                finally:
                    # Release the input even when conversion fails.
                    fp.close()
        converter.close()
    finally:
        writer.close()
    return