def _parseRatio(ratio):
    """Split a "NUM/DEN" ratio string into a (num, den) pair of floats.

    Both parts must be integers and DEN must be non-zero.  Raises
    ValueError for anything else (wrong number of parts, non-integer
    parts, or a zero denominator).
    """
    num, den = ratio.split('/')
    # Parse as integers first so that fractional parts are rejected, then
    # widen to float so the slice arithmetic below uses true division.
    num, den = float(int(num)), float(int(den))

    if den == 0:
        raise ValueError("ratio denominator must be non-zero")

    return num, den


def parseCommandLine():
    """Parse the crawler's command line and return the populated options.

    Besides the raw optparse values, the returned object carries:
      * ``sources`` - the instantiated entity sources (all of them when no
        source names are given or ``--all`` is set),
      * ``sink``    - the instantiated output sink,
      * ``offset`` / ``limit`` - recomputed from ``--ratio`` when one is given.

    Side effects: stores the options on ``Globals.options``, assigns
    ``_globals`` to every source, and initialises the db config when
    ``--db`` is given.

    Exits the process (via ``sys.exit`` / ``parser.error``) when a source
    name is unrecognized, when ``--ratio`` is malformed, or after printing
    the total for ``--count``.
    """
    usage = "Usage: %prog [options] [sources]"
    version = "%prog " + __version__
    parser = OptionParser(usage=usage, version=version)

    parser.add_option("-a", "--all", action="store_true", dest="all",
        default=False, help="crawl all available sources (defaults to true if no sources are specified)")

    parser.add_option("-o", "--offset", default=None,
        type="int", dest="offset",
        help="start index of entities to import")

    parser.add_option("-l", "--limit", default=None, type="int",
        help="limits the number of entities to import")

    parser.add_option("-r", "--ratio", default=None, type="string",
        action="store", dest="ratio",
        help="where this crawler fits in to a distributed stack")

    parser.add_option("-s", "--sink", default=None, type="string",
        action="store", dest="sink",
        help="where to output to (test or mongodb)")

    parser.add_option("-t", "--test", default=False,
        action="store_true", dest="test",
        help="run the crawler with limited input for testing purposes")

    parser.add_option("-c", "--count", default=False,
        action="store_true", dest="count",
        help="print overall entity count from all sources specified and return")

    parser.add_option("-u", "--update", default=False,
        action="store_true", dest="update",
        help="update the existing collection as opposed to dropping it and " +
        "overwriting any previous contents (the default)")

    parser.add_option("-g", "--geocode", default=False,
        action="store_true", dest="geocode",
        help="Geocode places to ensure all places have a valid lat/lng associated with them.")

    parser.add_option("-m", "--mount", default=False,
        action="store_true", dest="mount",
        help="mount crawler data directory if necessary")

    parser.add_option("-d", "--db", default=None, type="string",
        action="store", dest="db",
        help="db to connect to for output")

    # NOTE: a "--distribute" host-list option (and the matching
    # options.distributed / options.hosts fields) used to be wired up here
    # but is currently disabled.

    (options, args) = parser.parse_args()

    # Default the start index only when the caller did not supply one, so
    # that an explicit -o/--offset is honoured instead of being discarded.
    if options.offset is None:
        options.offset = 0

    Globals.options = options

    # No explicit source names means "crawl everything".
    if not args:
        options.all = True

    if options.all:
        options.sources = EntitySources.instantiateAll()
    else:
        options.sources = []

        for arg in args:
            source = EntitySources.instantiateSource(arg)

            if source is None:
                # Single-argument parenthesised print: same output under
                # the Python 2 print statement this file relies on.
                print("Error: unrecognized source '%s'" % arg)
                parser.print_help()
                sys.exit(1)

            options.sources.append(source)

    for source in options.sources:
        source._globals = _globals

    if options.count or options.ratio:
        count = sum(source.getMaxNumEntities() for source in options.sources)

        if options.count:
            print(count)
            sys.exit(0)

        try:
            num, den = _parseRatio(options.ratio)
        except ValueError:
            # Report a usage error instead of dying with a traceback.
            parser.error("invalid ratio '%s' (expected NUM/DEN with integer "
                         "parts and a non-zero DEN)" % options.ratio)

        # From here on options.count holds the total entity count rather
        # than the boolean flag.
        options.count = count

        # This crawler handles the NUM-th of DEN slices of the entities.
        options.offset = int(math.floor((count * (num - 1)) / den))
        options.limit = int(math.ceil(count / den) + 1)

    if options.db:
        utils.init_db_config(options.db)

    if options.sink == "test":
        options.sink = TestEntitySink()
    elif options.sink == "merge":
        options.sink = MergeEntitySink()
    else:
        # Imported lazily so it is only loaded when this sink is selected.
        from api.MongoStampedAPI import MongoStampedAPI
        options.sink = MongoStampedAPI(options.db)

    return options