def main(): logging.basicConfig(level=logging.DEBUG) if len(sys.argv) < 2: logger.error("Must specify a command! Try 'help'") sys.exit(-1) command = sys.argv[1] if command == 'help': # Display a help message logger.info("The following commands are supported:") logger.info(" index : builds indexes - warning: can take many hours") logger.info(" stats : print statistics in a tab-delimited CSV format") logger.info(" categories [lang] : list categories for 'lang', and the number of documents in each") elif command == 'list': # List available dumps dumps = find_dumps() # TODO: Print if an index exists for key in sorted(dumps): print " %-20s%s" % (key, dumps[key]) elif command == 'index': # Build indices load_dumps(build_index=True) elif command == 'stats': root_logger = logging.getLogger() root_logger.level = logging.ERROR # Display statistics paths = find_dumps() sizes = dict((k, len(Dump(p))) for k, p in paths.iteritems()) fields = ['lang', 'filename', 'pages', 'categories'] outfile = csv.DictWriter(sys.stdout, fields) for p in sorted(sizes, key=sizes.get, reverse=True): dump = Dump(paths[p]) d = dict\ ( lang=p , filename=os.path.basename(paths[p]) , pages=sizes[p] , categories=len(dump.categories) ) outfile.writerow(d) elif command == 'categories': # Dump category distribution parser = optparse.OptionParser() parser.add_option("-l", "--language", dest="lang", help="Relevant language prefix") options, args = parser.parse_args(sys.argv[2:]) dump = load_dumps([options.lang], build_index=True)[options.lang] cats = dump.categories for c in sorted(cats, key=lambda x:len(cats[x]), reverse=True): print "%-4d %s" % (len(cats[c]), c) else: logging.error("Unknown command: %s", command) logging.info("Try the 'help' command.")
def main(): logging.basicConfig(level=logging.DEBUG) if len(sys.argv) < 2: logger.error("Must specify language prefix") sys.exit(-1) # Dump category distribution parser = optparse.OptionParser() parser.add_option("-l", "--language", dest="lang", help="Relevant language prefix") parser.add_option("-o", "--output", dest="output", help="Output format: csv or yaml") options, args = parser.parse_args(sys.argv[1:]) dump = load_dumps([options.lang], build_index=True)[options.lang] cats = dump.categories get_page = dump.get_page_by_index cat_spacename = category_identifier[options.lang] if options.output == 'csv': writer = csv.writer(sys.stdout,quoting=csv.QUOTE_ALL) for c in sorted(cats, key=lambda x:len(cats[x]), reverse=True): for a in cats[c]: title = get_page(a).title if title.find(cat_spacename) == -1: writer.writerow((c, title)) else: for c in sorted(cats, key=lambda x:len(cats[x]), reverse=True): print "%s:" % (c) print "# %d articles" % (len(cats[c])) for a in cats[c]: title = get_page(a).title if title.find(cat_spacename) == -1: print "- %s" %( get_page(a).title )