Example #1
    def run(self, pool_size):
        try:
            uri_list = open('./page_lists/' + self.pages_file_name, 'r')
        except:
            print(
                'File "%s" does not exist, file must be in ./page_lists directory.  Exiting.'
                % self.pages_file_name)
            exit()
        sql_driver = MySQLDriver(self.db_name)

        # sort out what uris we are processing from the list
        uris_to_process = []

        count = 0

        print('\t------------------------')
        print('\t Building List of Pages ')
        print('\t------------------------')

        for uri in uri_list:
            # skip lines that are comments
            if "#" in uri[0]: continue

            count += 1

            # drop trailing '/, clean off white space, make lower, create cli-safe uri
            # with parse.quote, but exclude :/ b/c of http://
            uri = re.sub('/$', '',
                         urllib.parse.quote(uri.strip(), safe=":/").lower())

            # if it is a m$ office or other doc, skip
            if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', uri):
                print("\t\t%s | %-50s Not an HTML document, Skipping." %
                      (count, uri[:50]))
                continue

            # skip if in db already
            if sql_driver.page_exists(uri):
                print("\t\t%s | %-50s Exists in DB, Skipping." %
                      (count, uri[:50]))
                continue

            # only add if not in list already
            if uri not in uris_to_process:
                print("\t\t%s | %-50s Adding." % (count, uri[:50]))
                uris_to_process.append(uri)
            else:
                print("\t\t%s | %-50s Already queued, Skipping." %
                      (count, uri[:50]))

        print('\t----------------------------------')
        print('\t%s pages will now be webXray\'d' % len(uris_to_process))
        print('\t\t...you can go take a walk. ;-)')
        print('\t----------------------------------')

        myPool = Pool(pool_size)
        myPool.map(self.process_uri, uris_to_process)
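
A minimal sketch of the normalization step used above, assuming only the standard library (the sample URL is hypothetical):

import re
import urllib.parse

def normalize_uri(uri):
	# strip whitespace, percent-encode unsafe characters (keeping ':' and '/'
	# so the scheme survives), lowercase, and drop a single trailing slash
	return re.sub('/$', '', urllib.parse.quote(uri.strip(), safe=":/").lower())

print(normalize_uri('http://Example.com/Path With Spaces/'))
# -> http://example.com/path%20with%20spaces
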
Example #2
	def run(self, pool_size):
		try:
			uri_list = open('./page_lists/'+self.pages_file_name, 'r')
		except:
			print('File "%s" does not exist, file must be in ./page_lists directory.  Exiting.' % self.pages_file_name)
			exit()
		sql_driver = MySQLDriver(self.db_name)

		# sort out what uris we are processing from the list
		uris_to_process = []

		count = 0
		
		print('\t------------------------')
		print('\t Building List of Pages ')
		print('\t------------------------')
				
		for uri in uri_list:
			# skip lines that are comments
			if "#" in uri[0]: continue
		
			count += 1
		
			# only do lines starting with https?://
			if not (re.match('^https?://.+', uri)):
				print("\t\t%s | %-50s Not a valid address, Skipping." % (count, uri[:50]))
				continue
		
			# drop trailing '/, clean off white space, make lower, create cli-safe uri
			# with parse.quote, but exclude :/ b/c of http://
			uri = re.sub('/$', '', urllib.parse.quote(uri.strip(), safe=":/").lower())

			# if it is a m$ office or other doc, skip
			if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', uri):
				print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, uri[:50]))
				continue

			# skip if in db already
			if sql_driver.page_exists(uri):
				print("\t\t%s | %-50s Exists in DB, Skipping." % (count, uri[:50]))
				continue
	
			# only add if not in list already
			if uri not in uris_to_process:
				print("\t\t%s | %-50s Adding." % (count, uri[:50]))
				uris_to_process.append(uri)
			else:
				print("\t\t%s | %-50s Already queued, Skipping." % (count, uri[:50]))

		print('\t----------------------------------')
		print('\t%s addresses will now be webXray\'d'  % len(uris_to_process))
		print('\t\t...you can go take a walk. ;-)')
		print('\t----------------------------------')

		myPool = Pool(pool_size)
		myPool.map(self.process_uri, uris_to_process)
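
The scheme check this variant adds can be exercised on its own; a quick sketch with hypothetical input lines:

import re

# hypothetical page-list lines, for illustration
lines = [
	'# a comment line',
	'https://example.org/page',
	'ftp://example.net/file',
	'example.com',
]

for line in lines:
	# skip comments, then keep only lines that start with http:// or https://
	if line.startswith('#'):
		continue
	if not re.match('^https?://.+', line):
		print('skipping: %s' % line)
		continue
	print('keeping:  %s' % line)
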
Example #3
	def run(self, pool_size):
		"""
		this function manages the parallel processing of the url list using the python Pool class

		the function first reads the list of urls out of the page_lists directory, cleans it
			for known issues (eg common binary files), and issues with idna encoding (tricky!)

		then the page list is mapped to the process_url function and executed in parallel

		pool_size is defined in the run_webxray.py file, see details there
		"""

		# the list of urls MUST be in the page_lists directory!
		try:
			url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r', encoding='utf-8')
		except:
			print('File "%s" does not exist, file must be in ./page_lists directory.  Exiting.' % self.pages_file_name)
			exit()

		# set up sql connection used to determine if items are already in the db
		if self.db_engine == 'mysql':		
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'postgres':	
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':	
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)

		# this list gets mapped to the Pool, very important!
		urls_to_process = set()

		# simple counter used solely for updates to CLI
		count = 0
		
		print('\t------------------------')
		print('\t Building List of Pages ')
		print('\t------------------------')
				
		for url in url_list:
			# skip lines that are comments
			if "#" in url[0]: continue
		
			count += 1
		
			# only do lines starting with https?://
			if not (re.match('^https?://.+', url)):
				print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50]))
				continue

			# non-ascii domains will crash phantomjs, so we need to convert them to 
			# 	idna/ascii/utf-8
			# this requires splitting apart the url, converting the domain to idna,
			#	and pasting it all back together
			
			split_url = urlsplit(url.strip())
			idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
			url = urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,split_url.fragment))

			# if it is a m$ office or other doc, skip
			if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url):
				print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50]))
				continue

			# skip if in db already unless we are doing a timeseries
			if self.allow_timeseries == False:
				if sql_driver.page_exists(url):
					print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50]))
					continue
	
			# only add if not in list already
			if url not in urls_to_process:
				print("\t\t%s | %-50s Adding." % (count, url[:50]))
				urls_to_process.add(url)
			else:
				print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50]))

		# close the db connection
		sql_driver.close()

		print('\t----------------------------------')
		print('\t%s addresses will now be webXray\'d'  % len(urls_to_process))
		print('\t\tBrowser(s) are %s' % self.browser_types)
		print('\t\tBrowser wait time is %s seconds' % self.browser_wait)
		print('\t\t...you can go take a walk. ;-)')
		print('\t----------------------------------')

		# for macOS (darwin) we must specify start method as 'forkserver'
		#	this is essentially voodoo to ward off evil spirits which 
		#	appear when large pool sizes are used on macOS
		# get_start_method must be set to 'allow_none', otherwise upon
		#	checking the method it gets set (!) - and if we then get/set again
		#	we get an error
		if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
			multiprocessing.set_start_method('forkserver')
		myPool = multiprocessing.Pool(pool_size)
		myPool.map(self.process_url, urls_to_process)

		# FYI
		self.print_runtime()
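
The IDNA round-trip in the loop above can be isolated into a short sketch, assuming only the standard library (the non-ASCII domain is hypothetical):

from urllib.parse import urlsplit, urlunsplit

# hypothetical URL with a non-ASCII domain
url = 'http://bücher.example/kataloge?seite=1'

split_url = urlsplit(url.strip())
# convert only the network location to idna/ascii, leave the rest untouched
idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
url = urlunsplit((split_url.scheme, idna_fixed_netloc, split_url.path, split_url.query, split_url.fragment))

print(url)  # -> http://xn--bcher-kva.example/kataloge?seite=1
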
Example #4
	def run(self, pool_size):
		"""
		this function manages the parallel processing of the url list using the python Pool class

		the function first reads the list of urls out of the page_lists directory, cleans it
			for known issues (eg common binary files), and issues with idna encoding (tricky!)

		then the page list is mapped to the process_url function and executed in parallel

		pool_size is defined in the run_webxray.py file, see details there
		"""

		# the list of urls MUST be in the page_lists directory!
		try:
			url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r')
		except:
			print('File "%s" does not exist, file must be in ./page_lists directory.  Exiting.' % self.pages_file_name)
			exit()

		# set up sql connection used to determine if items are already in the db
		if self.db_engine == 'mysql':		
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'postgres':	
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':	
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)

		# this list gets mapped to the Pool, very important!
		urls_to_process = set()

		# simple counter used solely for updates to CLI
		count = 0
		
		print('\t------------------------')
		print('\t Building List of Pages ')
		print('\t------------------------')
				
		for url in url_list:
			# skip lines that are comments
			if "#" in url[0]: continue
		
			count += 1
		
			# only do lines starting with https?://
			if not (re.match('^https?://.+', url)):
				print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50]))
				continue

			# non-ascii domains will crash phantomjs, so we need to convert them to 
			# 	idna/ascii/utf-8
			# this requires splitting apart the url, converting the domain to idna,
			#	and pasting it all back together
			
			split_url = urlsplit(url.strip())
			idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
			url = urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,split_url.fragment))

			# if it is a m$ office or other doc, skip
			if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url):
				print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50]))
				continue

			# skip if in db already unless we are doing a timeseries
			if self.allow_timeseries == False:
				if sql_driver.page_exists(url):
					print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50]))
					continue
	
			# only add if not in list already
			if url not in urls_to_process:
				print("\t\t%s | %-50s Adding." % (count, url[:50]))
				urls_to_process.add(url)
			else:
				print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50]))

		# close the db connection
		sql_driver.close()

		print('\t----------------------------------')
		print('\t%s addresses will now be webXray\'d'  % len(urls_to_process))
		print('\t\tBrowser(s) are %s' % self.browser_types)
		print('\t\tBrowser wait time is %s seconds' % self.browser_wait)
		print('\t\t...you can go take a walk. ;-)')
		print('\t----------------------------------')

		# for macOS (darwin) we must specify start method as 'forkserver'
		#	this is essentially voodoo to ward off evil spirits which 
		#	appear when large pool sizes are used on macOS
		# get_start_method must be set to 'allow_none', otherwise upon
		#	checking the method it gets set (!) - and if we then get/set again
		#	we get an error
		if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
			multiprocessing.set_start_method('forkserver')
		myPool = multiprocessing.Pool(pool_size)
		myPool.map(self.process_url, urls_to_process)

		# FYI
		self.print_runtime()
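
The macOS start-method guard used in these two variants can be reduced to a standalone sketch; work and the inputs are hypothetical stand-ins for process_url and the URL set:

import sys
import multiprocessing

def work(item):
	# hypothetical stand-in for process_url
	return item * 2

if __name__ == '__main__':
	# on macOS switch to 'forkserver' once, before any Pool exists;
	# allow_none=True keeps get_start_method from fixing the default itself
	if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
		multiprocessing.set_start_method('forkserver')
	with multiprocessing.Pool(4) as pool:
		print(pool.map(work, [1, 2, 3]))  # -> [2, 4, 6]
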
Example #5
	def run(self, pool_size):
		try:
			uri_list = open('./page_lists/'+self.pages_file_name, 'r')
		except:
			print('File "%s" does not exist, file must be in ./page_lists directory.  Exiting.' % self.pages_file_name)
			exit()
		sql_driver = MySQLDriver(self.db_name)

		# sort out what uris we are processing from the list
		uris_to_process = []

		count = 0
		
		print('\t------------------------')
		print('\t Building List of Pages ')
		print('\t------------------------')
				
		for uri in uri_list:
			# skip lines that are comments
			if "#" in uri[0]: continue
		
			count += 1
		
			# only do lines starting with https?://
			if not (re.match('^https?://.+', uri)):
				print("\t\t%s | %-50s Not a valid address, Skipping." % (count, uri[:50]))
				continue

			# non-ascii domains will crash phantomjs, so we need to convert them to 
			# 	idna/ascii/utf-8
			# this requires splitting apart the uri, converting the domain to idna,
			#	and pasting it all back together. ugly.
			
			parsed_uri = urlsplit(uri.strip())
			uri = parsed_uri[0] + "://"
			uri += parsed_uri[1].encode('idna').decode('utf-8')
			
			# if chunks exist glue them back together
			
			if len(parsed_uri[2]) != 0:
				uri += parsed_uri[2]
			if len(parsed_uri[3]) != 0:
				uri += '?' + parsed_uri[3]
			if len(parsed_uri[4]) != 0:
				uri += '#' + parsed_uri[4]

			# if it is a m$ office or other doc, skip
			if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', uri):
				print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, uri[:50]))
				continue

			# skip if in db already
			if sql_driver.page_exists(uri):
				print("\t\t%s | %-50s Exists in DB, Skipping." % (count, uri[:50]))
				continue
	
			# only add if not in list already
			if uri not in uris_to_process:
				print("\t\t%s | %-50s Adding." % (count, uri[:50]))
				uris_to_process.append(uri)
			else:
				print("\t\t%s | %-50s Already queued, Skipping." % (count, uri[:50]))

		print('\t----------------------------------')
		print('\t%s addresses will now be webXray\'d'  % len(uris_to_process))
		print('\t\t...you can go take a walk. ;-)')
		print('\t----------------------------------')

		myPool = Pool(pool_size)
		myPool.map(self.process_uri, uris_to_process)
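
This older variant glues the urlsplit pieces back together by hand; a short sketch, on a hypothetical URL, showing that the manual reassembly agrees with urlunsplit:

from urllib.parse import urlsplit, urlunsplit

# hypothetical URL for illustration
parsed = urlsplit('https://example.com/path?x=1#top')

# manual reassembly, as in the example above (minus the idna step)
uri = parsed[0] + '://' + parsed[1]
if len(parsed[2]) != 0:
	uri += parsed[2]
if len(parsed[3]) != 0:
	uri += '?' + parsed[3]
if len(parsed[4]) != 0:
	uri += '#' + parsed[4]

print(uri == urlunsplit(parsed))  # -> True
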