Example #1
import socket
import sys
import time
from optparse import OptionParser

def main():
    print "***"
    print "*** Spider Bot v%s" % __version__

    parser = OptionParser(usage=MAIN_USAGE_STR)
    if len(sys.argv) < 3:
        print MAIN_USAGE_STR
        sys.exit(1)
    parser.add_option('-s', '--seed_dir', dest='seed_dir',
                      help='Simple text seed directory path (default: %default)',
                      default=None)
    parser.add_option('-d', '--dump_dir', dest='dump_dir',
                      help='Output content dump directory path (default: %default)',
                      default=None)
    options, args = parser.parse_args()
    if not args:
        parser.print_help()
        sys.exit(1)

    # Access the options with 'options.seed_dir', for example
    now = time.localtime(time.time())
    print "*** database directory=%s" % sys.argv[1]
    print "*** %s" % time.asctime(now)
    start = time.time()
    socket.setdefaulttimeout(DEFAULT_REQUEST_TIMEOUT)

    # argv entries are never None, so test for the argument's presence instead
    if len(sys.argv) > ARG_SUB_COMMAND:
        cmd_key = sys.argv[ARG_SUB_COMMAND].lower()
        if cmd_key in APP_SUB_COMMANDS:
            # Look up the validator for this sub-command; .get() returns None
            # instead of raising if no validator is registered for it
            bot_method = ARG_METHOD_VALIDATORS.get(cmd_key)
            if bot_method and bot_method(options, "seed"):
                link_list = runSeedDir(options.seed_dir)
                # The URL pool contains a collection of the URL field data structures
                infoPool = URLInfoPool()
                infoPool.buildURLPool(link_list)
                create_database(sys.argv[:-1], infoPool)
            elif bot_method and bot_method(options, "dump"):
                link_list = runSeedDir(options.seed_dir)
                dump_list = crawlForURLContentDump(link_list)
                create_content_db(options.dump_dir, dump_list)
            else:
                # APP_EXIT_POINT (invalid sub-command arguments)
                print ERROR_MESSAGES['err_sub']
                print MAIN_USAGE_STR
                print "ERR with subcommand, COMMAND-LINE ARGS:\n<<<%s>>>" % sys.argv
                sys.exit(1)
        else:
            # APP_EXIT_POINT (invalid sub-command)
            print ERROR_MESSAGES['err_sub']
            print MAIN_USAGE_STR
            sys.exit(1)

    end = time.time()
    diff = end - start
    print "\n*** Done"
    print "*** spider bot processing time=%s" % diff
Example #2
                       (len(line) > 0) and (len(line.split('::|')) == NO_COLS_SERVICE),
                       link_data)
    content = [col.split('::|') for col in link_data]
    return content
	
def runServiceURLPool():
    try:
        data = connectLinkService(URL_LINK_SERVICE)
    except urllib2.URLError as urlerr:
        print "FATAL ERR: could not connect to link seed service"
        print urlerr
        sys.exit(-1)

    link_list = [line_set[0] for line_set in data]
    # The URL pool contains a collection of the URL field data structures
    infoPool = URLInfoPool()
    infoPool.buildURLPool(link_list)
    create_database(sys.argv[1], infoPool)

def runSeedDir(seed_dir):
    """Return all lines from the *.tdb files in this directory."""
    print "*** Processing seed directory (all files of *.tdb): %s" % seed_dir
    tdb_files = glob.glob('%s/*.tdb' % seed_dir)
    content_lines = []
    for tdb_path in tdb_files:
        # 'with' closes the file even if reading fails, and avoids
        # shadowing the path variable with the file object
        with open(tdb_path, 'r') as tdb_file:
            content_lines.extend(line.strip() for line in tdb_file.readlines())
    return content_lines
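Example #2 opens mid-function: its first three lines are the tail of a filter applied to records fetched from the link seed service. A minimal sketch of what the complete helper might look like, assuming URL_LINK_SERVICE holds the service URL and NO_COLS_SERVICE the expected '::|' column count (both names appear in the snippet; the fetch logic itself is an assumption):

import urllib2

NO_COLS_SERVICE = 3  # placeholder: expected columns per record (real value not shown)

def connectLinkService(service_url):
    # Sketch only: fetch newline-separated records from the seed service and
    # keep the non-empty rows with the expected number of '::|' columns.
    response = urllib2.urlopen(service_url)
    link_data = [line.strip() for line in response.readlines()]
    link_data = filter(lambda line:
                       (len(line) > 0) and (len(line.split('::|')) == NO_COLS_SERVICE),
                       link_data)
    content = [col.split('::|') for col in link_data]
    return content

runServiceURLPool then keeps only line_set[0] from each record, so the first '::|'-delimited field is evidently the URL itself.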