Example #1
    def execute(self, url, browser_wait):
        """
		Main function, loads page and analyzes results.
		"""

        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print('Single Site Test On: %s' % url)
        print('\tBrowser type is %s' % self.browser_type)
        print('\tBrowser wait time is %s seconds' % browser_wait)

        # make sure it is an http(s) address
        if not re.match('^https?://', url):
            print('\tNot a valid url, aborting')
            return None

        # import and set up the specified browser driver
        if self.browser_type == 'phantomjs':
            browser_driver = PhantomDriver()
        elif self.browser_type == 'chrome':
            # launch once to read the headless user-agent string,
            # then relaunch with that user-agent set explicitly
            browser_driver = ChromeDriver()
            chrome_ua = browser_driver.get_ua_for_headless()
            browser_driver = ChromeDriver(ua=chrome_ua)
        else:
            print('\tUnsupported browser type %s, aborting' % self.browser_type)
            return None

        # attempt to get the page
        browser_output = browser_driver.get_webxray_scan_data(
            url, browser_wait)

        # if there was a problem we print the error
        if not browser_output['success']:
            print('\t\t%-50s Browser Error: %s' %
                  (url[:50], browser_output['result']))
            return None
        else:
            browser_output = browser_output['result']

        # get the ip, fqdn, domain, pubsuffix, and tld from the URL
        # we need the domain to figure out if cookies/elements are third-party
        origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
            url)

        # if we can't get page domain info we bail out
        if origin_ip_fqdn_domain_pubsuffix_tld is None:
            print('could not parse origin domain')
            return None

        origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0]
        origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1]
        origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2]
        origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3]
        origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4]

        print('\n\t------------------{ URL }------------------')
        print('\t' + url)
        print('\n\t------------------{ Final URL }------------------')
        print('\t' + browser_output['final_url'])
        print('\n\t------------------{ Domain }------------------')
        print('\t' + origin_domain)
        print(
            '\n\t------------------{ Seconds to Complete Download }------------------'
        )
        print('\t%s' % (browser_output['load_time'] / 1000))
        print('\n\t------------------{ 3rd Party Cookies }------------------')
        cookie_list = []
        for cookie in browser_output['cookies']:
            # get domain, pubsuffix, and tld from the cookie
            # prepend 'http://' because the parser expects a full url with a scheme
            cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                'http://' + cookie['domain'])

            # something went wrong, but we continue to go process the elements
            if cookie_ip_fqdn_domain_pubsuffix_tld is None:
                print('could not parse cookie')
                continue

            # otherwise, everything went fine
            cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0]
            cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1]
            cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2]
            cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3]
            cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4]

            # record third-party cookies (cookie domain differs from the origin domain)
            if origin_domain not in cookie_domain:
                cookie_list.append(
                    re.sub(r'^\.', '', cookie['domain']) + ' -> ' +
                    cookie['name'])

        cookie_list.sort()
        for count, cookie in enumerate(cookie_list, start=1):
            print('\t%s) %s' % (count, cookie))

        print(
            '\n\t------------------{ 3p Domains Requested }------------------')
        element_domains = []

        for request in browser_output['processed_requests']:
            # if the request starts with 'data'/etc we can't parse tld anyway, so skip
            if re.match('^(data|about|chrome).+', request):
                continue

            element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                request)

            # problem with this request, bail on it and do the next
            if element_ip_fqdn_domain_pubsuffix_tld is None:
                continue

            element_ip = element_ip_fqdn_domain_pubsuffix_tld[0]
            element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1]
            element_domain = element_ip_fqdn_domain_pubsuffix_tld[2]
            element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3]
            element_tld = element_ip_fqdn_domain_pubsuffix_tld[4]

            if origin_domain not in element_domain:
                if element_domain not in element_domains:
                    element_domains.append(element_domain)

        element_domains.sort()

        for count, domain in enumerate(element_domains, start=1):
            if domain in self.domain_owners:
                lineage = ''
                for item in self.get_lineage(self.domain_owners[domain]):
                    lineage += self.id_to_owner[item] + ' > '
                print('\t%s) %s [%s]' % (count, domain, lineage[:-3]))
            else:
                print('\t%s) %s [Unknown Owner]' % (count, domain))
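
A minimal sketch of how this method might be invoked. The class name SingleScan and its constructor signature are assumptions for illustration; they are not shown in the example above.

# hypothetical usage sketch -- the class name and constructor signature are assumed
from webxray.SingleScan import SingleScan

single_scan = SingleScan('chrome')              # assumed: browser type passed to the constructor
single_scan.execute('https://example.com', 30)  # load the page, wait 30 seconds, print the report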
Example #2
	def process_url(self, url):
		"""
		this function takes a specified url, loads it in the browser (currently phantomjs)
			and returns json-formatted output with relevant request data, etc.

		the output_store class then puts this data in the db for later analysis
		"""

		# set up sql connection used to log errors and do timeseries checks
		if self.db_engine == 'mysql':
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		else:
			print('Unknown db_engine %s, aborting' % self.db_engine)
			return

		# output store does the heavy lifting of analyzing browser output and storing to db
		output_store = OutputStore(self.db_engine, self.db_name)

		# support for loading same page with multiple browsers - purposefully undocumented 
		for browser_type in self.browser_types:

			# import and set up the specified browser driver
			# 	note we need to set up a new browser each time to
			#	get a fresh profile
			if browser_type == 'phantomjs':
				browser_driver = PhantomDriver()
			elif browser_type == 'chrome':
				browser_driver = ChromeDriver(ua=self.chrome_ua)
			else:
				print('\t\tUnsupported browser type %s, skipping' % browser_type)
				continue

			# support for timeseries collections - purposefully undocumented 
			if self.allow_timeseries:
				page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url,browser_type)
				if page_last_accessed_browser_type:
					time_diff = datetime.now()-page_last_accessed_browser_type[0]
					if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
						print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
						continue

			# attempt to load the page, fail gracefully
			try:
				browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
			except Exception:
				print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
				sql_driver.log_error(url, 'Unable to load page')
				sql_driver.close()
				return		
			
			# if there was a problem browser_output will be None
			if browser_output is None:
				print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
				sql_driver.log_error(url, 'Unable to load page')
				sql_driver.close()
				return

			# attempt to store the output
			if output_store.store(url, browser_output):
				print('\t\t%-50s Success with %s' % (url[:50], browser_type))
			else:
				print('\t\t%-50s Fail with %s' % (url[:50], browser_type))
				sql_driver.log_error(url, 'Unable to store output')

		sql_driver.close()
		return
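
The timeseries guard above skips a page that the same browser already scanned within the configured interval. A standalone sketch of that comparison, with illustrative values for the names involved:

# illustrative sketch of the interval check used in process_url above
from datetime import datetime, timedelta

interval_minutes = 60                        # assumed rescan interval
last_accessed = datetime(2018, 1, 1, 12, 0)  # e.g. the timestamp returned from the db
if datetime.now() - last_accessed < timedelta(minutes=interval_minutes):
    print('Scanned too recently, skipping')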
Example #3
File: Collector.py  Project: timlib/webXray
	def process_url(self, url):
		"""
		this function takes a specified url, loads it in the browser (currently phantomjs)
			and returns json-formatted output with relevant request data, etc.

		the output_store class then puts this data in the db for later analysis
		"""

		# set up sql connection used to log errors and do timeseries checks
		if self.db_engine == 'mysql':
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		else:
			print('Unknown db_engine %s, aborting' % self.db_engine)
			return

		# output store does the heavy lifting of analyzing browser output and storing to db
		output_store = OutputStore(self.db_engine, self.db_name)

		# support for loading same page with multiple browsers - purposefully undocumented 
		for browser_type in self.browser_types:

			# import and set up the specified browser driver
			# 	note we need to set up a new browser each time to
			#	get a fresh profile
			if browser_type == 'phantomjs':
				browser_driver = PhantomDriver()
			elif browser_type == 'chrome':
				browser_driver = ChromeDriver(ua=self.chrome_ua)
			else:
				print('\t\tUnsupported browser type %s, skipping' % browser_type)
				continue

			# support for timeseries collections - purposefully undocumented 
			if self.allow_timeseries:
				page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url,browser_type)
				if page_last_accessed_browser_type:
					time_diff = datetime.now()-page_last_accessed_browser_type[0]
					if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
						print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
						continue

			# attempt to load the page, fail gracefully
			try:
				browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
			except Exception:
				print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
				sql_driver.log_error(url, 'Unable to load page')
				sql_driver.close()
				return
			
			# if there was a problem we log the error
			if not browser_output['success']:
				print('\t\t%-50s Browser %s Error: %s' % (url[:50], browser_type, browser_output['result']))
				sql_driver.log_error(url, 'Unable to load page')
				sql_driver.close()
				return
			else:
				# no error, treat result as browser output
				browser_output = browser_output['result']

			# attempt to store the output
			if output_store.store(url, browser_output):
				print('\t\t%-50s Success with %s' % (url[:50], browser_type))
			else:
				print('\t\t%-50s Fail with %s' % (url[:50], browser_type))
				sql_driver.log_error(url, 'Unable to store output')

		sql_driver.close()
		return
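
A minimal sketch of how process_url might be driven. The Collector constructor arguments are assumptions for illustration; in the full project the collector is set up elsewhere with the db and browser configuration.

# hypothetical usage sketch -- the Collector constructor signature is assumed
from webxray.Collector import Collector

collector = Collector('sqlite', 'wbxr_demo')  # assumed: db engine and db name
collector.process_url('https://example.com')  # scan one page and store the results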