import sys
import time
import socket
import glob
import urllib2
from optparse import OptionParser


def main():
    print "***"
    print "*** Spider Bot v%s" % __version__
    parser = OptionParser(usage=MAIN_USAGE_STR)
    if len(sys.argv) < 3:
        print MAIN_USAGE_STR
        sys.exit(1)
    parser.add_option('-s', '--seed_dir', dest='seed_dir',
                      help='Simple text seed directory path (default: %default)',
                      default=None)
    parser.add_option('-d', '--dump_dir', dest='dump_dir',
                      help='Output content dump directory path (default: %default)',
                      default=None)
    options, args = parser.parse_args()
    if not args:
        parser.print_help()
        sys.exit(1)
    # Access the options with 'options.seed_dir', for example.
    now = time.localtime(time.time())
    print "*** database directory=%s" % sys.argv[1]
    print "*** %s" % time.asctime(now)
    start = time.time()
    socket.setdefaulttimeout(DEFAULT_REQUEST_TIMEOUT)
    cmd_key = sys.argv[ARG_SUB_COMMAND].lower()
    if cmd_key in APP_SUB_COMMANDS:
        bot_method = ARG_METHOD_VALIDATORS.get(cmd_key)
        if bot_method is not None and bot_method(options, "seed"):
            link_list = runSeedDir(options.seed_dir)
            # The URL pool contains a collection of the URL field data structures.
            infoPool = URLInfoPool()
            infoPool.buildURLPool(link_list)
            create_database(sys.argv[1], infoPool)
        elif bot_method is not None and bot_method(options, "dump"):
            link_list = runSeedDir(options.seed_dir)
            dump_list = crawlForURLContentDump(link_list)
            create_content_db(options.dump_dir, dump_list)
        else:
            # APP_EXIT_POINT (invalid sub-command)
            print ERROR_MESSAGES['err_sub']
            print MAIN_USAGE_STR
            print "ERR with subcommand, COMMAND-LINE ARGS:\n<<<%s>>>" % sys.argv
            sys.exit(1)
    else:
        # APP_EXIT_POINT (invalid sub-command)
        print ERROR_MESSAGES['err_sub']
        print MAIN_USAGE_STR
        sys.exit(1)
    end = time.time()
    diff = end - start
    print "\n*** Done"
    print "*** spider bot processing time=%s" % diff
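# Hypothetical invocation sketch (the script and directory names below are
# assumed; MAIN_USAGE_STR, APP_SUB_COMMANDS, and ARG_SUB_COMMAND are defined
# elsewhere in the module):
#
#   python spiderbot.py crawlerdb seed --seed_dir=./seeds
#
# Here sys.argv[1] is the database directory, and the sub-command at
# sys.argv[ARG_SUB_COMMAND] ('seed' or 'dump') selects the processing path
# validated through ARG_METHOD_VALIDATORS.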
def connectLinkService(service_url):
    # NOTE: the beginning of this function is missing from the source excerpt;
    # the fetch below is an assumed reconstruction. It reads the raw seed
    # records, each a '::|'-delimited line, from the link seed service.
    link_data = urllib2.urlopen(service_url).readlines()
    content = [col.split('::|') for col in link_data]
    return content


def runServiceURLPool():
    try:
        data = connectLinkService(URL_LINK_SERVICE)
    except urllib2.URLError, urlerr:
        print "FATAL ERR: could not connect to link seed service"
        print urlerr
        sys.exit(-1)
    # The first field of each '::|'-delimited record is the URL itself.
    link_list = [line_set[0] for line_set in data]
    # The URL pool contains a collection of the URL field data structures.
    infoPool = URLInfoPool()
    infoPool.buildURLPool(link_list)
    create_database(sys.argv[1], infoPool)


def runSeedDir(seed_dir):
    """Return all lines from every *.tdb seed file in the given directory."""
    print "*** Processing seed directory (all files of *.tdb): %s" % seed_dir
    tdb_files = glob.glob('%s/*.tdb' % seed_dir)
    content_lines = []
    for tdb_path in tdb_files:
        tdb_file = open(tdb_path, 'r')
        content = tdb_file.readlines()
        # Strip surrounding whitespace from each seed URL line.
        urllines = [line.strip() for line in content]
        for line in urllines:
            content_lines.append(line)
        tdb_file.close()
    return content_lines
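# A minimal entry-point sketch, assuming this module is executed directly as
# the spider bot script.
if __name__ == '__main__':
    main()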