def __init__(self):
	"""
	Load the public suffix list and open a MySQL-backed cache used to
	speed up repeated sub-domain -> tld lookups.
	"""
	# load up the tld list now as we only hit it once this way
	self.pubsuffix_list = self.get_pubsuffix_list()

	# this db acts as a shared hash-table cache of prior lookups,
	# reused across all runs over time
	self.sql_driver = MySQLDriver()

	# check if sub_domain_tld exists, if not create one - this should
	# only really happen once.
	#
	# NOTE(review): with a process pool two workers can race here; both may
	# see the db as missing and each create a blank one, losing a few early
	# entries.  This db is ONLY a speed-up cache (never used for analysis),
	# so the occasional lost record is harmless - but the race remains.
	#
	# fix: idiomatic truthiness test instead of '== False'
	if not self.sql_driver.check_db_exist('sub_domain_tld'):
		self.sql_driver.create_sub_domain_tld_db()
	else:
		self.sql_driver.db_switch('sub_domain_tld')
def __init__(self, db_name, num_tlds, num_results, tracker_threshold=0):
	"""
	Prepare a report run: record configuration, open the db connection,
	set up the output directory, patch org ownership data, and
	pre-compute the top tlds (and, optionally, tracker domains).
	"""
	# record configuration first
	self.db_name = db_name
	self.num_tlds = num_tlds
	self.num_results = num_results
	self.tracker_threshold = tracker_threshold

	# open the shared db connection and grab values reused later
	self.sql_driver = MySQLDriver(self.db_name)
	self.startTime = datetime.now()
	self.pages_ok_count = self.sql_driver.pages_ok_count()

	print('\t=============================')
	print('\t Checking Output Directories ')
	print('\t=============================')
	self.setup_report_dir()

	print('\t===========================')
	print('\t Patching DB with Org Data ')
	print('\t===========================')
	# map each stored domain to its owning organization
	self.patch_org_data()
	print('\t\tSuccess!')

	print('\t=====================')
	print('\t Getting top %s tlds' % self.num_tlds)
	print('\t=====================')
	print('\t\tProcessing...')
	self.top_tlds = self.get_top_tlds(self.num_tlds)
	print(self.top_tlds)
	print('\t\tSuccess!')
	print('\t\tThe top tlds are:')
	for (this_tld, page_count) in self.top_tlds:
		print('\t\t |- %s (%s)' % (this_tld, page_count))

	# EXPERT OPTION: tracker-domain filtering.
	#
	# A "tracker domain" is one connected to at least tracker_threshold
	# sites; such domains may correlate visits via referer strings etc.
	# This works on very large samples (1M+ sites) but NOT on small ones
	# (e.g. 500), where known trackers may appear on only one site -
	# hence it is off by default.  Unless you understand what you are
	# doing, DON'T USE THIS!  Longer-term a pre-trained tracker list
	# from a larger corpus may replace it.
	if self.tracker_threshold:
		print('\t=========================')
		print('\t Getting tracker domains ')
		print('\t=========================')
		print('\t\tProcessing...')
		self.tracker_domains = self.get_tracker_domains(self.tracker_threshold)
		print('\t\tSuccess!')
	else:
		self.tracker_domains = []
def run(self, pool_size):
	"""
	Build the list of pages to process from self.pages_file_name and
	farm them out to a pool of workers.

	Lines that are comments, non-http(s) addresses, office/pdf
	documents, already in the db, or already queued are skipped with a
	notice printed for each.

	pool_size -- number of parallel worker processes
	"""
	try:
		uri_list = open('./page_lists/' + self.pages_file_name, 'r')
	except:
		print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name)
		exit()

	sql_driver = MySQLDriver(self.db_name)

	# sort out what uris we are processing from the list
	uris_to_process = []
	count = 0

	print('\t------------------------')
	print('\t Building List of Pages ')
	print('\t------------------------')

	for uri in uri_list:
		# skip lines that are comments
		if "#" in uri[0]:
			continue

		count += 1

		# only do lines starting with https?://
		# (fix: this check exists in the sibling implementation but was
		# missing here, letting malformed lines through to processing)
		if not (re.match('^https?://.+', uri)):
			print("\t\t%s | %-50s Not a valid address, Skipping." % (count, uri[:50]))
			continue

		# drop trailing '/', clean off white space, make lower, create a
		# cli-safe uri with parse.quote, but exclude :/ b/c of http://
		uri = re.sub('/$', '', urllib.parse.quote(uri.strip(), safe=":/").lower())

		# if it is an office or other non-html doc, skip
		if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', uri):
			print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, uri[:50]))
			continue

		# skip if in db already
		if sql_driver.page_exists(uri):
			print("\t\t%s | %-50s Exists in DB, Skipping." % (count, uri[:50]))
			continue

		# only add if not in list already
		if uri not in uris_to_process:
			print("\t\t%s | %-50s Adding." % (count, uri[:50]))
			uris_to_process.append(uri)
		else:
			print("\t\t%s | %-50s Already queued, Skipping." % (count, uri[:50]))

	# fix: close the page-list file handle instead of leaking it
	uri_list.close()

	print('\t----------------------------------')
	print('\t%s pages will now be webXray\'d' % len(uris_to_process))
	print('\t\t...you can go take a walk. ;-)')
	print('\t----------------------------------')

	myPool = Pool(pool_size)
	myPool.map(self.process_uri, uris_to_process)
def run(self, pool_size):
	"""
	Read self.pages_file_name from ./page_lists, filter the addresses,
	and hand the survivors to a pool of pool_size worker processes.
	"""
	try:
		uri_list = open('./page_lists/'+self.pages_file_name, 'r')
	except:
		print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name)
		exit()

	sql_driver = MySQLDriver(self.db_name)

	# addresses that survive every filter below
	pending_uris = []
	line_count = 0

	print('\t------------------------')
	print('\t Building List of Pages ')
	print('\t------------------------')

	for uri in uri_list:
		# comment lines are ignored outright
		if uri[0] == "#":
			continue

		line_count += 1

		# anything that is not an http(s) address is rejected
		if re.match('^https?://.+', uri) is None:
			print("\t\t%s | %-50s Not a valid address, Skipping." % (line_count, uri[:50]))
			continue

		# normalize: strip whitespace, cli-quote (keeping ':/' so the
		# scheme survives), lowercase, and drop a trailing '/'
		uri = re.sub('/$', '', urllib.parse.quote(uri.strip(), safe=":/").lower())

		# office documents, pdfs, and other non-html files are skipped
		if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', uri):
			print("\t\t%s | %-50s Not an HTML document, Skipping." % (line_count, uri[:50]))
			continue

		# already stored by an earlier run
		if sql_driver.page_exists(uri):
			print("\t\t%s | %-50s Exists in DB, Skipping." % (line_count, uri[:50]))
			continue

		# de-dupe within this run
		if uri in pending_uris:
			print("\t\t%s | %-50s Already queued, Skipping." % (line_count, uri[:50]))
		else:
			print("\t\t%s | %-50s Adding." % (line_count, uri[:50]))
			pending_uris.append(uri)

	print('\t----------------------------------')
	print('\t%s addresses will now be webXray\'d' % len(pending_uris))
	print('\t\t...you can go take a walk. ;-)')
	print('\t----------------------------------')

	myPool = Pool(pool_size)
	myPool.map(self.process_uri, pending_uris)
def process_uri(self, uri):
	"""
	Run phantomjs against a single uri and store the results in the db.

	Failures are logged via sql_driver.log_error.  fix: the original
	leaked both db connections on the early-return path (phantomjs not
	returning); a try/finally now guarantees they are closed.
	"""
	sql_driver = MySQLDriver(self.db_name)
	output_store = OutputStore(self.db_name)
	phantom_driver = PhantomDriver(
		'--ignore-ssl-errors=true --ssl-protocol=any', 'wbxr_logger.js')

	try:
		# 90s timeout can be raised or lowered depending on network load;
		# generally, 90 seems to be fine
		try:
			phantom_output = phantom_driver.execute(uri, 90)
		except:
			print("\t\t%-50s Phantomjs Did Not Return." % uri[:50])
			sql_driver.log_error(uri, "FAIL: Phantomjs Did Not Return.")
			return

		if re.match('^FAIL.+', phantom_output):
			print("\t\t%-50s Phantom Error\n\t%s" % (uri[:50], phantom_output))
			sql_driver.log_error(uri, phantom_output)
		else:
			print("\t\t%-50s %s" % (uri[:50], output_store.store(uri, phantom_output)))
	finally:
		# always close db connections, even when we bailed out early
		sql_driver.close()
		output_store.close()
	return
def __init__(self, db_name, num_tlds, num_results, tracker_threshold = 0):
	"""
	Initialize a reporting run against the named wbxr database.

	Stores configuration, opens the db connection, prepares the report
	directory, patches domain ownership data, and determines the top
	tlds; tracker-domain filtering is set up only when a threshold is
	supplied.
	"""
	self.db_name = db_name
	self.sql_driver = MySQLDriver(self.db_name)
	self.num_tlds = num_tlds
	self.num_results = num_results
	self.tracker_threshold = tracker_threshold
	self.startTime = datetime.now()
	# cached since several reports need this count
	self.pages_ok_count = self.sql_driver.pages_ok_count()

	print('\t=============================')
	print('\t Checking Output Directories ')
	print('\t=============================')
	self.setup_report_dir()

	print('\t===========================')
	print('\t Patching DB with Org Data ')
	print('\t===========================')
	# rewrite stored domains to point at their owning organizations
	self.patch_org_data()
	print('\t\tSuccess!')

	print('\t=====================')
	print('\t Getting top %s tlds' % self.num_tlds)
	print('\t=====================')
	print('\t\tProcessing...')
	self.top_tlds = self.get_top_tlds(self.num_tlds)
	print(self.top_tlds)
	print('\t\tSuccess!')
	print('\t\tThe top tlds are:')
	for (suffix, suffix_pages) in self.top_tlds:
		print('\t\t |- %s (%s)' % (suffix, suffix_pages))

	# OPTIONAL tracker-domain filter - experts only.
	#
	# Domains connected to at least tracker_threshold sites are treated
	# as 'tracker domains' since they can correlate visits (referer
	# strings etc).  Reliable on very large crawls (1M+ sites), badly
	# misleading on small samples where real trackers may show up on a
	# single site - which is why the default threshold of 0 disables it.
	if tracker_threshold:
		print('\t=========================')
		print('\t Getting tracker domains ')
		print('\t=========================')
		print('\t\tProcessing...')
		self.tracker_domains = self.get_tracker_domains(self.tracker_threshold)
		print('\t\tSuccess!')
	else:
		self.tracker_domains = []
async def post_analyze_url(request: XrayAnalyseRequest):
	"""
	Decrypt the submitted url, look up its stored cookie domains in the
	db, and run a fresh analysis on a cache miss.

	Returns a JSONResponse of {"status": bool, "cookies": [...]}, with
	status False when any step raised.
	"""
	# fix: corrected typo in log message ('recieve' -> 'received')
	print('Data received.')
	status = False
	cookies = []
	driver = MySQLDriver('wbxr_gayatri')
	try:
		# request.url arrives encrypted and PKCS7-padded; undo both,
		# then decode the plaintext bytes
		url = unpadPKCS7(decrypt(request.url))
		url = url.decode('utf8')
		# NOTE(review): this prints the *decrypted* url, so the label
		# 'Encoded' looks wrong ('Decoded'?) - left as-is, confirm intent
		print('Encoded', url)
		if history_filter(url):
			# parameterized query: cookie domains already stored for this page
			driver.db.execute(
				"SELECT cookie.`domain` "
				"FROM page "
				"LEFT JOIN page_cookie_junction "
				"ON page.id = page_cookie_junction.page_id "
				"LEFT JOIN cookie "
				"ON page_cookie_junction.cookie_id = cookie.id "
				"WHERE page.start_uri_md5 = MD5(%s)",
				(url,),
			)
			fetched = driver.db.fetchall()
			if fetched:
				cookies = fetched
			else:
				# cache miss: analyze the page now
				cookies = analyze_url(url)
		else:
			cookies = []
		print(cookies)
	except Exception as e:
		# top-level service boundary: log the failure, report status=False
		print(e)
	else:
		status = True
	return JSONResponse(content={"status": status, "cookies": cookies})
def process_uri(self, uri):
	"""
	Fetch a single uri with phantomjs and persist the output.

	Errors are recorded via sql_driver.log_error.  fix: both db
	connections were leaked when phantomjs failed to return, because the
	early 'return' skipped the close() calls; a try/finally now ensures
	cleanup on every path.
	"""
	sql_driver = MySQLDriver(self.db_name)
	output_store = OutputStore(self.db_name)
	phantom_driver = PhantomDriver('--ignore-ssl-errors=true --ssl-protocol=any', 'wbxr_logger.js')

	try:
		# this timeout can be higher or lower depending on network load;
		# generally, 90 seems to be fine, so keep with it
		try:
			phantom_output = phantom_driver.execute(uri, 90)
		except:
			print("\t\t%-50s Phantomjs Did Not Return." % uri[:50])
			sql_driver.log_error(uri, "FAIL: Phantomjs Did Not Return.")
			return

		if re.match('^FAIL.+', phantom_output):
			print("\t\t%-50s Phantom Error\n\t%s" % (uri[:50], phantom_output))
			sql_driver.log_error(uri, phantom_output)
		else:
			print("\t\t%-50s %s" % (uri[:50], output_store.store(uri, phantom_output)))
	finally:
		# closes our db connections on success and failure alike
		sql_driver.close()
		output_store.close()
	return
# fix: the old check 'sys.version_info[1] < 4' compared only the minor
# version, so Python 2.7 (minor 7) slipped through and a future 4.0
# would be rejected; comparing the full version tuple handles both.
if sys.version_info < (3, 4):
	print('Python 3.4 or above is required for webXray to function; please check your installation.')
	exit()

# standard python 3.4 libs
import os
import re
import time
from optparse import OptionParser

# set up a global mysql driver, in the future you could use other db drivers here
# if the mysql connector is not installed this fails gracefully
from webxray.MySQLDriver import MySQLDriver
sql_driver = MySQLDriver()

# databases are stored with a 'wbxr_' prefix, this function helps select a
# database in interactive mode
def select_wbxr_db():
	wbxr_dbs = sql_driver.get_wbxr_dbs_list()

	# nothing to analyze yet: send the user back to the main menu
	if len(wbxr_dbs) == 0:
		print('''\t\tThere are no databases to analyze, please try [C]ollecting data or import an existing wbxr-formatted database manually.''')
		interaction()
		return

	# list the available dbs, stripping the 'wbxr_' prefix for display
	for index, db_name in enumerate(wbxr_dbs):
		print('\t\t[%s] %s' % (index, db_name[5:]))
def __init__(self, dbname):
	"""Set up the helpers this store needs for the given database."""
	# parser used to split uris into domain / pubsuffix / tld
	self.uri_parser = ParseURI()
	# connection to the named wbxr database
	self.sql_driver = MySQLDriver(dbname)
class OutputStore:
	"""
	Takes the raw JSON emitted by the phantomjs logger for a single page
	and stores the page record, its third-party cookies, and its
	third-party element requests in the wbxr database.
	"""

	def __init__(self, dbname):
		# parser used to split uris into domain / pubsuffix / tld
		self.uri_parser = ParseURI()
		# connection to the named wbxr database
		self.sql_driver = MySQLDriver(dbname)
	# end init

	def store(self, uri, phantom_output):
		"""
		Parse phantom_output for the page at uri and write the page,
		third-party cookies, and third-party element requests to the db.

		Returns a human-readable status string; failures are also logged
		to the db via log_error.
		"""
		# parse out the json from our phantom_output
		# sometimes phantom prints out errors before the json, (despite us turning
		# it off!), so we match inside of the {}s to get json only
		try:
			data = json.loads(re.search('(\{.+\})', phantom_output).group(1))
		except Exception as e:
			self.sql_driver.log_error(uri, "Could Not Load JSON: %s" % e)
			return 'Could Not Load JSON'

		# we need to parse the domain to determine if requests are local or 3rd party
		# we need pubsuffix and tld for later analysis so store them now
		origin_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(uri)
		origin_domain = origin_domain_pubsuffix_tld[0]
		origin_pubsuffix = origin_domain_pubsuffix_tld[1]
		origin_tld = origin_domain_pubsuffix_tld[2]

		# the parser signals failure by returning 'Exception...' strings
		if re.match('^Exception.+', origin_domain):
			self.sql_driver.log_error(uri, 'Could not parse TLD for %s' % uri)
			return 'Could not parse TLD for %s' % uri

		# add_domain stores the domain (if new) and returns its db row id
		page_domain_id = self.sql_driver.add_domain(origin_domain, origin_pubsuffix, origin_tld)

		# newFollowLogger is giving us the follow JSON data:
		# note source is null for now, but store anyway
		# big_out = {
		#     final_uri: final_uri,
		#     title: page.title,
		#     meta_desc : meta_desc,
		#     requested_uris: JSON.stringify(requested_uris),
		#     received_uris: JSON.stringify(received_uris),
		#     cookies: phantom.cookies,
		#     source: 'NULL',
		# };

		# we are now storing uris with and without args and saving args
		# we must unquote the uri to get back to its original state so we can parse
		start_uri_original = urllib.parse.unquote(uri)

		try:
			start_uri_no_args = re.search('^(.+?)\?.+$', start_uri_original).group(1) # start uri no args
		except:
			start_uri_no_args = uri

		try:
			start_uri_args = re.search('^.+(\?.+)$', start_uri_original).group(1) # start uri args
		except:
			start_uri_args = 'NULL'

		# same for the final uri (this is where we are after potential redirects)
		final_uri = re.sub('\"', '', json.dumps(data["final_uri"]))
		final_uri_original = urllib.parse.unquote(final_uri)

		try:
			final_uri_no_args = re.search('^(.+?)\?.+$', final_uri_original).group(1) # final uri no args
		except:
			final_uri_no_args = final_uri

		try:
			final_uri_args = re.search('^.+(\?.+)$', final_uri_original).group(1) # final uri args
		except:
			final_uri_args = 'NULL'

		# add page
		# json.dumps to make sure strings go out ok for db
		page_id = self.sql_driver.add_page(
			str(re.sub('\"', '', json.dumps(data["title"]))),
			str(re.sub('\"', '', json.dumps(data["meta_desc"]))),
			uri,
			start_uri_no_args,
			start_uri_args,
			final_uri,
			final_uri_no_args,
			final_uri_args,
			str(re.sub('\"', '', json.dumps(data["source"]))),
			str(re.sub('\"', '', json.dumps(data["requested_uris"]))),
			str(re.sub('\"', '', json.dumps(data["received_uris"]))),
			page_domain_id)

		for cookie in data["cookies"]:
			# store external cookies; uri_parser fails on non-http, we should fix this -
			# right now a lame hack is to prepend http://
			cookie_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld("http://"+cookie["domain"])
			cookie_domain = cookie_domain_pubsuffix_tld[0]
			cookie_pubsuffix = cookie_domain_pubsuffix_tld[1]
			cookie_tld = cookie_domain_pubsuffix_tld[2]

			# something went wrong, but carry on...
			if re.match('^Exception.+', cookie_domain):
				self.sql_driver.log_error(uri, 'Error parsing cookie: '+cookie_domain)
				continue

			# this is a 3party cookie (1st-party cookies are skipped)
			if origin_domain != cookie_domain:
				cookie_domain_id = self.sql_driver.add_domain(cookie_domain, cookie_pubsuffix, cookie_tld)

				# name and domain are required, so if they fail we just continue
				try:
					name = cookie["name"]
				except:
					continue

				try:
					domain = cookie_domain
				except:
					continue

				# these are optional, keep going with "N/A" vals
				try:
					secure = cookie["secure"]
				except:
					secure = "N/A"

				try:
					path = cookie["path"]
				except:
					path = "N/A"

				try:
					expires = cookie["expires"]
				except:
					expires = "N/A"

				try:
					httponly = cookie["httponly"]
				except:
					httponly = "N/A"

				try:
					expiry = cookie["expiry"]
				except:
					expiry = "N/A"

				try:
					value = cookie["value"]
				except:
					value = "N/A"

				cookie_id = self.sql_driver.add_cookie(
					name, secure, path, domain,
					expires, httponly, expiry, value,
					cookie_domain_id)
				self.sql_driver.add_cookie_to_page(cookie_id, page_id)

		for request in data["requested_uris"]:
			# if the request starts with "data" we can't parse tld anyway, so skip
			if re.match('^(data|about|chrome).+', request):
				continue

			# get domain, pubsuffix, and tld from request
			requested_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(request)
			requested_domain = requested_domain_pubsuffix_tld[0]
			requested_pubsuffix = requested_domain_pubsuffix_tld[1]
			requested_tld = requested_domain_pubsuffix_tld[2]

			# see if we got back what we requested, if not a few things may have happened
			# 	* malformed uri
			#	* resource is gone or never existed
			#	* network latency (ie it didn't arrive in window specified)
			#	* we could be behind a firewall or censorship mechanism (eg gfw, golden shield)
			#	* our IP is blacklisted b/c we are totally a bot X-D
			# the point being, interpret this value with an open mind
			# (note: 'recieved' spelling is kept as it matches the db field usage)
			if request in data['received_uris']:
				recieved = '1'
			else:
				recieved = '0'

			# catch parser failures, log, and carry on
			if re.match('^Exception.+', requested_domain):
				self.sql_driver.log_error(uri, 'Error parsing element request: '+request)
				continue

			# store new elements - only third-party requests are kept
			if origin_domain != requested_domain:
				full_uri = request
				try:
					element_uri = re.search('^(.+?)\?.+$', full_uri).group(1) # element uri no args
				except:
					element_uri = full_uri

				# attempt to parse off the extension
				try:
					element_extension = re.search('\.([0-9A-Za-z]+)$', element_uri).group(1).lower()
				except:
					element_extension = "NULL"

				# figure out what type of element it is from its extension
				if element_extension == 'png' or element_extension == 'jpg' or element_extension == 'jpgx' or element_extension == 'jpeg' or element_extension == 'gif' or element_extension == 'svg' or element_extension == 'bmp' or element_extension == 'tif' or element_extension == 'tiff' or element_extension == 'webp' or element_extension == 'srf':
					element_type = 'image'
				elif element_extension == 'js' or element_extension == 'javascript':
					element_type = 'javascript'
				elif element_extension == 'json' or element_extension == 'jsonp' or element_extension == 'xml':
					element_type = 'data_structured'
				elif element_extension == 'css':
					element_type = 'style_sheet'
				elif element_extension == 'woff' or element_extension == 'ttf' or element_extension == 'otf':
					element_type = 'font'
				elif element_extension == 'htm' or element_extension == 'html' or element_extension == 'shtml':
					element_type = 'page_static'
				elif element_extension == 'php' or element_extension == 'asp' or element_extension == 'jsp' or element_extension == 'aspx' or element_extension == 'ashx' or element_extension == 'pl' or element_extension == 'cgi' or element_extension == 'fcgi':
					element_type = 'page_dynamic'
				elif element_extension == 'swf' or element_extension == 'fla':
					element_type = 'Shockwave Flash'
				elif element_extension == 'NULL':
					element_type = 'NULL'
				else:
					element_type = 'unknown'

				try:
					args = re.search('^.+(\?.+)$', full_uri).group(1) # element args
				except:
					args = 'NULL'

				element_domain_id = self.sql_driver.add_domain(requested_domain, requested_pubsuffix, requested_tld)

				element_id = self.sql_driver.add_element("NULL", full_uri, element_uri, recieved, element_extension, element_type, args, element_domain_id)
				self.sql_driver.add_element_to_page(element_id, page_id)

		return 'Successfully Added to DB'
	# end store()

	def close(self):
		# close mysql connections held by the parser and the driver
		self.uri_parser.close()
		self.sql_driver.close()
		return
	# end close
# end OutputStore
class ParseURI:
	"""
	Splits uris into (domain, pubsuffix, tld) tuples using a bundled
	public suffix list, with a MySQL-backed cache of prior lookups.
	"""

	def __init__(self):
		# load up the tld list now as we only hit it once this way
		self.pubsuffix_list = self.get_pubsuffix_list()

		# this is to speed up tld lookups with a hash table
		# we can share among all runs over time
		self.sql_driver = MySQLDriver()

		# check if sub_domain_tld exists, if not create one
		# this should only really happen once
		#
		# bug?
		# when using pool this can be done several times in tandem
		# resulting in some lost entries - e.g. two threads see db not exist, the first
		# one to break the tie will create a new blank db and add a new entry;
		# it is possible the slightly slower second thread will also create a blank db
		# erasing the entry of the first thread. however, after the first round the db
		# should not be re-created again
		#
		# it should be emphasized this db is ONLY used to speed up parsing the tld
		# (partially because the regex method is slow and should be refactored!)
		# the data in this db is NOT used for analysis - thus deleting a few records
		# will have a minor impact on speed for the first few entries, and then be
		# wholly irrelevant
		#
		# still...it irks me that this entire class could be smarter, so I will
		# fix it another day
		if self.sql_driver.check_db_exist('sub_domain_tld') == False:
			self.sql_driver.create_sub_domain_tld_db()
		else:
			self.sql_driver.db_switch('sub_domain_tld')
	# end __init__

	def get_pubsuffix_list(self):
		"""
		Read the bundled public suffix list and return it as a list
		sorted longest-first, with a handful of very common suffixes
		moved to the front for speed.
		"""
		# get the file from the local dir
		# NOTE(review): this file handle is never closed - harmless for a
		# one-shot read, but worth tidying
		pubsuffix_raw_list = open(
			os.path.join(
				os.path.dirname(__file__),
				'./resources/pubsuffix/patchedPublicSuffixList-20150514.txt'),
			'r')

		pubsuffix_list = []

		for line in pubsuffix_raw_list:
			# the last part of the list is private domains we don't care about, so stop reading
			if re.match("^// ===BEGIN PRIVATE DOMAINS===", line):
				break

			# skip lines that are comments or blank, add others to list
			# also remove leading ., !, and * as they break the regex later
			if not re.match("^//.+$|^$", line):
				pubsuffix_list.append(re.sub('^[\!\*]\.?', '', line.strip()))

		# we sort long->short so we can match deeper TLDs first (ie, ac.uk *before* .uk)
		pubsuffix_list.sort(key=len, reverse=True)

		# to speed things up we move the most common TLDs to the front of the line
		# that said, if it's not one of these we take a MASSIVE performance hit
		popular_pubsuffixs = ['gov', 'co.uk', 'mil', 'org', 'net', 'edu', 'com']
		for popular_pubsuffix in popular_pubsuffixs:
			pubsuffix_list.remove(popular_pubsuffix)
			pubsuffix_list.insert(0, popular_pubsuffix)

		return pubsuffix_list
	# get_pubsuffix_list

	def get_domain_pubsuffix_tld(self, uri):
		"""
		Return a (domain, pubsuffix, tld) tuple for uri.

		On failure every slot of the tuple holds an 'Exception: ...'
		string instead - callers detect this with a regex match.
		Successful lookups are cached in the sub_domain_tld db.
		"""
		# pull out the first chunk of the domain, possibly including subdomains
		# if this fails, the uri is malformed (or not http(s) at least), bail out
		# only handles a domain followed by $, ?, \, or / - other forms will break...
		# '|\:[0-9].+' handles a port at the end, ie blah.com:8080
		# '\.?' at the top handles a leading '.'
		try:
			sub_domain = re.search('^https?:\/\/\.?(.*?)(\:[0-9].+)?($|\/|\?|\\\\|=)', uri).group(1)
		except:
			return ('Exception: Unable to parse: '+uri[:50],
					'Exception: Unable to parse: '+uri[:50],
					'Exception: Unable to parse: '+uri[:50])

		# strip off leading or trailing '.', which breaks things downstream
		sub_domain = re.sub('(^\.|\.$)', '', sub_domain)

		# see if it is an ip address, if so return it
		# this is only the ipv4 pattern though; maybe should think about ipv6?
		try:
			re.search('(^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$)', sub_domain).group(1)
			return (sub_domain, 'None', 'None')
		except:
			pass

		# first, we will see if it is in the db cache already
		record = self.sql_driver.sub_domain_exists(sub_domain)
		if record:
			return record

		# the pubsuffix list is sorted large -> small,
		# so the first match is the one we want;
		# after we find it, break out and continue
		for pubsuffix_try in self.pubsuffix_list:
			if re.match(".+\." + pubsuffix_try + "$", sub_domain):
				pubsuffix = pubsuffix_try
				break

		# if we didn't find the pubsuffix we fail
		# (the name is only bound when the loop above matched)
		try:
			pubsuffix
		except:
			return ('Exception: Unable to parse: '+uri[:50],
					'Exception: Unable to parse: '+uri[:50],
					'Exception: Unable to parse: '+uri[:50])

		# if we have sub.domain.tld_match, we just want domain.tld_match
		# there is no reason this should fail if we got this far, but try/except to be safe
		try:
			domain = re.search('(.*\.)?\.?(.*\.' + pubsuffix + ')$', sub_domain).group(2)
		except:
			return ('Exception: Unable to parse: '+uri[:50],
					'Exception: Unable to parse: '+uri[:50],
					'Exception: Unable to parse: '+uri[:50])

		# grab the tld off of the pubsuffix
		# if the regex fails the tld and pubsuffix are the same
		try:
			tld = re.search('\.([0-9A-Za-z]+)$', pubsuffix).group(1).lower()
		except:
			tld = pubsuffix

		# cache the result for future lookups
		self.sql_driver.add_sub_domain_pubsuffix_tld(sub_domain, domain, pubsuffix, tld)

		return (domain, pubsuffix, tld)
	# end get_domain_pubsuffix_tld

	def close(self):
		# close mysql connection
		self.sql_driver.close()
		return
	# end close
# end ParseURI
def __init__(self, db_engine, db_name, num_tlds, num_results, tracker_threshold=None, flush_owner_db=True):
	"""
	This performs a few start-up tasks:
		- sets up some useful global variables
		- makes sure we have a directory to store the reports
		- flushes the existing domain_owner mappings (this can be disabled
		  via flush_owner_db)
		- if we want to do per-tld reports, figures out the most common
		- if we want to filter against a given tracker threshold, sets it up
		  here (see documentation below for tracker threshold)
	"""

	# set various global vars
	self.db_engine = db_engine
	self.db_name = db_name
	self.num_tlds = num_tlds
	self.top_tlds = []
	self.num_results = num_results
	self.tracker_threshold = tracker_threshold
	self.start_time = datetime.now()

	# number of decimal places to round to in reports
	self.num_decimals = 2

	# set up global db connection; imports are done lazily so only the
	# selected driver's dependencies need to be installed
	if self.db_engine == 'mysql':
		from webxray.MySQLDriver import MySQLDriver
		self.sql_driver = MySQLDriver(self.db_name)
	elif self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		self.sql_driver = SQLiteDriver(self.db_name)
	elif db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		self.sql_driver = PostgreSQLDriver(self.db_name)
	else:
		print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
		exit()

	# this is reused often, do it once to save time
	self.get_pages_ok_count = self.sql_driver.get_pages_ok_count()

	print('\t=============================')
	print('\t Checking Output Directories ')
	print('\t=============================')

	self.setup_report_dir()

	print('\t============================')
	print('\t Patching Domain Owner Data ')
	print('\t============================')

	if flush_owner_db:
		# update the domains to their owners in the db, can be overridden
		# by changing flush_owner_db to false
		self.patch_domain_owners()
	else:
		print('\t\t\tSkipping')

	# this is used in various places to get owner information
	self.domain_owners = self.get_domain_owner_dict()

	# if we want to get sub-reports for the most frequent tlds we find
	# them here
	if self.num_tlds:
		print('\t=====================')
		print('\t Getting top %s tlds' % self.num_tlds)
		print('\t=====================')
		print('\t\tProcessing...', end='', flush=True)
		self.top_tlds = self.get_top_tlds(self.num_tlds)
		print('done!')
		print('\t\tThe top tlds are:')
		for (tld, pages) in self.top_tlds:
			if tld:
				print('\t\t |- %s (%s)' % (tld, pages))
	else:
		# otherwise we push in a single empty entry
		self.top_tlds.append((None, self.get_pages_ok_count))

	# SPECIAL FEATURE FOR EXPERTS: tracker domain filter
	#
	# you can set a threshold of the number of sites a given 3p domain
	# is connected to - domains connecting to many sites may correlate those visits
	# so we call these 'tracker domains'
	#
	# the 'tracker_threshold' variable set above controls the filtering level
	#
	# on a large set of sites (e.g. >10k) this works well but on small samples
	# (e.g. <500) it doesn't work as well, as known tracker domains may only
	# appear on a single site
	#
	# this is off by default and unless you understand what you are doing
	# don't use this...but because you are reading the source code for an otherwise
	# undocumented feature you are probably competent to use it ;-)
	#
	# longer-term we may want to train off a bigger corpus to find tracker domains and
	# have them prepackaged
	#
	# use at your own risk!
	if tracker_threshold:
		print('\t===================================================')
		print('\t Getting tracker domains with threshold level of %s' % self.tracker_threshold)
		print('\t===================================================')
		print('\t\tProcessing...', end='', flush=True)
		self.tracker_domains = self.get_tracker_domains(self.tracker_threshold)
		print('done!')
	else:
		# set to None so various downstream operations get skipped
		self.tracker_domains = None
def store(self, url, browser_output, store_source=False, store_1p=True, get_file_hashes=False, hash_3p_only=False):
	"""
	this is the primary function of this class, it takes the url of the
	given page and the request and cookie data generated by the browser

	data is cleaned up with some minor analysis (eg file types) and stored
	for later in-depth analysis

	Args:
		url:				address of the page that was scanned
		browser_output:		dict of page/request/cookie data from the browser driver
		store_source:		store the page source, off by default to save disk space
		store_1p:			store first-party requests/cookies as well as third, on by default
		get_file_hashes:	re-fetch each element and store an md5, serious overhead, off by default
		hash_3p_only:		when hashing, only hash third-party elements

	Returns:
		True on success, False if the page's TLD could not be parsed
	"""

	# open up a sql connection
	if self.db_engine == 'mysql':
		from webxray.MySQLDriver import MySQLDriver
		sql_driver = MySQLDriver(self.db_name)
	elif self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	else:
		# bug fix: this formerly printed '... % db_engine', which is a NameError
		# 	(db_engine is not a local name) masking the real error; also fixed
		# 	the 'INVALED' typo
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		exit()

	# get the ip, fqdn, domain, pubsuffix, and tld
	# we need the domain to figure out if cookies/elements are third-party
	origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

	# if we can't get page domain info we fail gracefully
	if origin_ip_fqdn_domain_pubsuffix_tld is None:
		sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
		return False

	origin_ip 			= origin_ip_fqdn_domain_pubsuffix_tld[0]
	origin_fqdn 		= origin_ip_fqdn_domain_pubsuffix_tld[1]
	origin_domain 		= origin_ip_fqdn_domain_pubsuffix_tld[2]
	origin_pubsuffix 	= origin_ip_fqdn_domain_pubsuffix_tld[3]
	origin_tld 			= origin_ip_fqdn_domain_pubsuffix_tld[4]

	# sql_driver.add_domain both stores the new domain and returns its db row id
	# 	if it is already in db just return the existing id
	page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld)

	# figure out the privacy policy url and text, starts null
	priv_policy_url = None
	priv_policy_url_text = None

	# read in our list of privacy link terms from the json file in webxray/resources/policyxray
	privacy_policy_term_list = self.utilities.get_privacy_policy_term_list()

	# we reverse links returned from browser to check footer links first
	# 	as that is where policy links tend to be
	all_links = browser_output['all_links']
	all_links.reverse()

	# if we have links search for privacy policy
	if len(all_links) > 0:
		# links are tuples of (link_text, link_url)
		for link_text, link_url in all_links:
			# makes sure we have text, skip links without
			if link_text:
				# need lower for string matching
				link_text = link_text.lower().strip()

				# not a link we can use
				if 'javascript' in link_text: continue

				# see if the link_text is in our term list
				if link_text in privacy_policy_term_list:
					# if the link_url is relative this will convert to absolute
					priv_policy_url = self.utilities.get_absolute_url_from_page_link(url,link_url)
					priv_policy_url_text = link_text
					break

	# if the final page is https (often after a redirect), mark it appropriately
	if browser_output['final_url'][:5] == 'https':
		page_is_ssl = True
	else:
		page_is_ssl = False

	if store_source:
		# handles issue where postgres will crash on inserting null character
		source = browser_output['source'].replace('\x00',' ')
	else:
		source = None

	# add page
	page_id = sql_driver.add_page(
		browser_output['browser_type'],
		browser_output['browser_version'],
		browser_output['browser_wait'],
		browser_output['title'],
		browser_output['meta_desc'],
		url,
		browser_output['final_url'],
		priv_policy_url,
		priv_policy_url_text,
		page_is_ssl,
		source,
		browser_output['load_time'],
		page_domain_id
	)

	# store cookies
	for cookie in browser_output['cookies']:
		# get the ip, fqdn, domain, pubsuffix, and tld
		# 	we need the domain to figure out if cookies/elements are third-party
		# note:
		# 	url_parser fails on non-http, we should fix this, right now
		# 	a lame hack is to prepend http://
		cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://'+cookie['domain'])

		# something went wrong, log and fail gracefully
		if cookie_ip_fqdn_domain_pubsuffix_tld is None:
			sql_driver.log_error(url, 'Error parsing cookie with domain: '+cookie['domain'])
			continue

		# otherwise, everything went fine
		cookie_ip 			= cookie_ip_fqdn_domain_pubsuffix_tld[0]
		cookie_fqdn 		= cookie_ip_fqdn_domain_pubsuffix_tld[1]
		cookie_domain 		= cookie_ip_fqdn_domain_pubsuffix_tld[2]
		cookie_pubsuffix 	= cookie_ip_fqdn_domain_pubsuffix_tld[3]
		cookie_tld 			= cookie_ip_fqdn_domain_pubsuffix_tld[4]

		# mark third-party cookies
		if origin_domain != cookie_domain:
			is_3p_cookie = True
		else:
			is_3p_cookie = False

		# this is a first party cookie, see if we want to store it
		if is_3p_cookie is False and store_1p is False: continue

		# sql_driver.add_domain both stores the new domain and returns its id
		cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld)

		# name is required, so if it is missing we just continue
		try:
			name = cookie['name']
		except:
			continue

		domain = cookie_domain

		# these are optional, fill with null values if absent
		# 	(.get is equivalent to the old try/except on a dict, just tidier)
		secure 		= cookie.get('secure', None)
		path 		= cookie.get('path', None)
		httponly 	= cookie.get('httponly', None)
		expiry 		= cookie.get('expiry', None)
		value 		= cookie.get('value', None)

		# all done with this cookie
		sql_driver.add_cookie(
			page_id,
			name, secure, path, domain,
			httponly, expiry, value,
			is_3p_cookie, cookie_domain_id
		)

	# process requests now
	for request in browser_output['processed_requests']:
		# if the request starts with the following we can't parse anyway, so skip
		if re.match(r'^(data|about|chrome|blob).+', request): continue

		# get the ip, fqdn, domain, pubsuffix, and tld
		# 	we need the domain to figure out if cookies/elements are third-party
		element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request)

		# problem with this request, log and fail gracefully
		if element_ip_fqdn_domain_pubsuffix_tld is None:
			sql_driver.log_error(url, 'Error parsing element request: '+request)
			continue

		element_ip 			= element_ip_fqdn_domain_pubsuffix_tld[0]
		element_fqdn 		= element_ip_fqdn_domain_pubsuffix_tld[1]
		element_domain 		= element_ip_fqdn_domain_pubsuffix_tld[2]
		element_pubsuffix 	= element_ip_fqdn_domain_pubsuffix_tld[3]
		element_tld 		= element_ip_fqdn_domain_pubsuffix_tld[4]

		# sql_driver.add_domain both stores the new domain and returns its db row id
		element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld)

		# mark third-party elements based on domain
		if origin_domain != element_domain:
			is_3p_element = True
		else:
			is_3p_element = False

		# if we are not storing 1p elements continue
		if is_3p_element is False and store_1p is False: continue

		if request[:5] == 'https' or request[:3] == 'wss':
			element_is_ssl = True
		else:
			element_is_ssl = False

		# hoist the per-request dict lookup out, it is read many times below
		request_data = browser_output['processed_requests'][request]

		# these fields are optional, fill with null values if absent
		received 			= request_data.get('received', None)
		referer 			= request_data.get('referer', None)
		start_time_offset 	= request_data.get('start_time_offset', None)
		load_time 			= request_data.get('load_time', None)
		status 				= request_data.get('status', None)
		status_text 		= request_data.get('status_text', None)
		content_type 		= request_data.get('content_type', None)
		body_size 			= request_data.get('body_size', None)

		# get domain of referer and determine if page leaked by referer
		if referer and len(referer) != 0:
			referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(referer)
			if referer_ip_fqdn_domain_pubsuffix_tld:
				if referer_ip_fqdn_domain_pubsuffix_tld[2] == origin_domain:
					page_domain_in_referer = True
				else:
					page_domain_in_referer = False
			else:
				page_domain_in_referer = None
				sql_driver.log_error(url, 'Error parsing referer header: '+referer)
		else:
			page_domain_in_referer = None

		# headers get str()'d for storage, keep try/except so a missing key yields None
		try:
			request_headers = str(request_data['request_headers'])
		except:
			request_headers = None

		try:
			response_headers = str(request_data['response_headers'])
		except:
			response_headers = None

		# consider anything before the "?" to be the element_url
		try:
			element_url = re.search(r'^(.+?)\?.+$', request).group(1)
		except:
			element_url = request

		# consider anything after the "?" to be the args
		try:
			element_args = re.search(r'^.+(\?.+)$', request).group(1)
		except:
			element_args = None

		# attempt to parse off the extension
		try:
			element_extension = re.search(r'\.([0-9A-Za-z]+)$', element_url).group(1).lower()
		except:
			element_extension = None

		# lists of common extensions, can be expanded
		image_extensions 	= ['png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf']
		script_extensions 	= ['js', 'javascript']
		data_extensions 	= ['json', 'jsonp', 'xml']
		font_extentions 	= ['woff', 'ttf', 'otf']
		static_extentions 	= ['html', 'htm', 'shtml']
		dynamic_extentions 	= ['php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi']

		# figure out what type of element it is
		if element_extension in image_extensions:
			element_type = 'image'
		elif element_extension in script_extensions:
			element_type = 'javascript'
		elif element_extension in data_extensions:
			element_type = 'data_structured'
		elif element_extension == 'css':
			element_type = 'style_sheet'
		elif element_extension in font_extentions:
			element_type = 'font'
		elif element_extension in static_extentions:
			element_type = 'page_static'
		elif element_extension in dynamic_extentions:
			# bug fix: this was '== dynamic_extentions' (string compared to a
			# 	list), which could never be True, so dynamic pages were typed None
			element_type = 'page_dynamic'
		elif element_extension == 'swf' or element_extension == 'fla':
			element_type = 'Shockwave Flash'
		else:
			element_type = None

		# file hashing has non-trivial overhead and is off by default
		#
		# what this does is use the same ua/referer as the actual request
		# 	so we are just replaying the last one to get similar response
		# note that we aren't sending the same cookies so that could be an issue
		# otherwise it is equivalent to a page refresh in theory

		# option to hash only 3p elements observed here
		if (get_file_hashes and hash_3p_only and is_3p_element) or (get_file_hashes and hash_3p_only == False):
			replay_element_request = urllib.request.Request(
				request,
				headers = {
					'User-Agent' : request_data['user_agent'],
					'Referer' : referer,
					'Accept' : '*/*'
				}
			)
			try:
				file_md5 = hashlib.md5(urllib.request.urlopen(replay_element_request,timeout=10).read()).hexdigest()
			except:
				file_md5 = None
		else:
			file_md5 = None

		# final task is to truncate the request if it is over 2k characters
		# 	as it is likely binary data and may cause problems inserting
		# 	into TEXT fields in database
		#
		# TODO:
		# 	better handle binary data in general
		if len(request) >= 2000: request = request[:2000]
		if len(element_url) >= 2000: element_url = element_url[:2000]

		# store request
		sql_driver.add_element(
			page_id,
			request, element_url,
			is_3p_element, element_is_ssl,
			received,
			referer, page_domain_in_referer,
			start_time_offset, load_time,
			status, status_text, content_type, body_size,
			request_headers, response_headers,
			file_md5,
			element_extension, element_type,
			element_args,
			element_domain_id
		)

	# close db connection
	sql_driver.close()
	return True
def run(self, pool_size):
	"""
	this function manages the parallel processing of the url list using
	the python multiprocessing Pool class

	the function first reads the list of urls out of the page_lists
	directory, cleans it for known issues (eg common binary files), and
	issues with idna encoding (tricky!), then the page list is mapped to
	the process_url function and executed in parallel

	pool_size is defined in the run_webxray.py file, see details there
	"""

	# the list of urls MUST be in the page_lists directory!
	try:
		url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r', encoding='utf-8')
	except:
		print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name)
		exit()

	# set up sql connection used to determine if items are already in the db
	if self.db_engine == 'mysql':
		from webxray.MySQLDriver import MySQLDriver
		sql_driver = MySQLDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	elif self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	else:
		# bug fix: an unrecognized engine used to fall through silently and
		# 	crash later with a NameError on the first use of sql_driver
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		exit()

	# this set gets mapped to the Pool, very important!
	urls_to_process = set()

	# simple counter used solely for updates to CLI
	count = 0

	print('\t------------------------')
	print('\t Building List of Pages ')
	print('\t------------------------')

	for url in url_list:
		# skip lines that are comments
		if "#" in url[0]: continue

		count += 1

		# only do lines starting with https?://
		if not (re.match(r'^https?://.+', url)):
			print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50]))
			continue

		# non-ascii domains will crash phantomjs, so we need to convert them to
		# 	idna/ascii/utf-8; this requires splitting apart the url, converting
		# 	the domain to idna, and pasting it all back together
		split_url = urlsplit(url.strip())
		idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
		url = urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,split_url.fragment))

		# if it is a m$ office or other doc, skip
		if re.match(r'.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url):
			print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50]))
			continue

		# skip if in db already unless we are doing a timeseries
		if self.allow_timeseries == False:
			if sql_driver.page_exists(url):
				print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50]))
				continue

		# only add if not in list already
		if url not in urls_to_process:
			print("\t\t%s | %-50s Adding." % (count, url[:50]))
			urls_to_process.add(url)
		else:
			print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50]))

	# done building the list, close the page list file (was leaked before)
	# 	and the db connection
	url_list.close()
	sql_driver.close()

	print('\t----------------------------------')
	print('\t%s addresses will now be webXray\'d' % len(urls_to_process))
	print('\t\tBrowser(s) are %s' % self.browser_types)
	print('\t\tBrowser wait time is %s seconds' % self.browser_wait)
	print('\t\t...you can go take a walk. ;-)')
	print('\t----------------------------------')

	# for macOS (darwin) we must specify start method as 'forkserver'
	# 	this is essentially voodoo to ward off evil spirits which
	# 	appear when large pool sizes are used on macOS
	# get_start_method must be set to 'allow_none', otherwise upon
	# 	checking the method it gets set (!) - and if we then get/set again
	# 	we get an error
	if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
		multiprocessing.set_start_method('forkserver')
	myPool = multiprocessing.Pool(pool_size)
	myPool.map(self.process_url, urls_to_process)

	# make sure worker processes are reaped before we report runtime
	myPool.close()
	myPool.join()

	# FYI
	self.print_runtime()
def process_url(self, url):
	"""
	this function takes a specified url, loads it in the browser (currently phantomjs)
	and returns json-formatted output with relevant request data, etc.

	the output_store class then puts this data in the db for later analysis
	"""

	# set up sql connection used to log errors and do timeseries checks
	# 	note each pool worker gets its own connection
	if self.db_engine == 'mysql':
		from webxray.MySQLDriver import MySQLDriver
		sql_driver = MySQLDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	elif self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)

	# output store does the heavy lifting of analyzing browser output and storing to db
	output_store = OutputStore(self.db_engine, self.db_name)

	# support for loading same page with multiple browsers - purposefully undocumented
	for browser_type in self.browser_types:
		# import and set up specified browser driver
		# 	note we need to set up a new browser each time to
		# 	get a fresh profile
		if browser_type == 'phantomjs':
			browser_driver 	= PhantomDriver()
		elif browser_type == 'chrome':
			browser_driver 	= ChromeDriver(ua=self.chrome_ua)

		# support for timeseries collections - purposefully undocumented
		# 	skip this url/browser pair if it was scanned within the
		# 	configured interval
		if self.allow_timeseries:
			page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url,browser_type)
			if page_last_accessed_browser_type:
				# tuple is (last_accessed_timestamp, browser_type)
				time_diff = datetime.now()-page_last_accessed_browser_type[0]
				if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
					print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
					continue

		# attempt to load the page, fail gracefully
		# 	broad except is deliberate: any driver failure is treated as
		# 	"did not return" and aborts the remaining browsers for this url
		try:
			browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
		except:
			print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
			sql_driver.log_error(url, 'Unable to load page')
			sql_driver.close()
			return

		# if there was a problem browser_output will be None
		if browser_output == None:
			print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
			sql_driver.log_error(url, 'Unable to load page')
			sql_driver.close()
			return

		# attempt to store the output
		if output_store.store(url, browser_output):
			print('\t\t%-50s Success with %s' % (url[:50],browser_type))
		else:
			print('\t\t%-50s Fail with %s' % (url[:50],browser_type))
			# NOTE(review): message says 'Unable to load page' but the page did
			# 	load - the store step failed; confirm whether a distinct
			# 	message is wanted here
			sql_driver.log_error(url, 'Unable to load page')

	# all browsers processed for this url, release the db connection
	sql_driver.close()
	return
def run(self, pool_size):
	"""
	this function manages the parallel processing of the url list using
	the python multiprocessing Pool class

	the function first reads the list of urls out of the page_lists
	directory, cleans it for known issues (eg common binary files), and
	issues with idna encoding (tricky!), then the page list is mapped to
	the process_url function and executed in parallel

	pool_size is defined in the run_webxray.py file, see details there
	"""

	# the list of urls MUST be in the page_lists directory!
	# 	bug fix: open with an explicit utf-8 encoding - the other copy of
	# 	this function in the codebase already does so, and relying on the
	# 	platform default can raise decode errors on non-ascii page lists
	try:
		url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r', encoding='utf-8')
	except:
		print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name)
		exit()

	# set up sql connection used to determine if items are already in the db
	if self.db_engine == 'mysql':
		from webxray.MySQLDriver import MySQLDriver
		sql_driver = MySQLDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	elif self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	else:
		# bug fix: an unrecognized engine used to fall through silently and
		# 	crash later with a NameError on the first use of sql_driver
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		exit()

	# this set gets mapped to the Pool, very important!
	urls_to_process = set()

	# simple counter used solely for updates to CLI
	count = 0

	print('\t------------------------')
	print('\t Building List of Pages ')
	print('\t------------------------')

	for url in url_list:
		# skip lines that are comments
		if "#" in url[0]: continue

		count += 1

		# only do lines starting with https?://
		if not (re.match(r'^https?://.+', url)):
			print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50]))
			continue

		# non-ascii domains will crash phantomjs, so we need to convert them to
		# 	idna/ascii/utf-8; this requires splitting apart the url, converting
		# 	the domain to idna, and pasting it all back together
		split_url = urlsplit(url.strip())
		idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
		url = urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,split_url.fragment))

		# if it is a m$ office or other doc, skip
		if re.match(r'.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url):
			print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50]))
			continue

		# skip if in db already unless we are doing a timeseries
		if self.allow_timeseries == False:
			if sql_driver.page_exists(url):
				print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50]))
				continue

		# only add if not in list already
		if url not in urls_to_process:
			print("\t\t%s | %-50s Adding." % (count, url[:50]))
			urls_to_process.add(url)
		else:
			print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50]))

	# done building the list, close the page list file (was leaked before)
	# 	and the db connection
	url_list.close()
	sql_driver.close()

	print('\t----------------------------------')
	print('\t%s addresses will now be webXray\'d' % len(urls_to_process))
	print('\t\tBrowser(s) are %s' % self.browser_types)
	print('\t\tBrowser wait time is %s seconds' % self.browser_wait)
	print('\t\t...you can go take a walk. ;-)')
	print('\t----------------------------------')

	# for macOS (darwin) we must specify start method as 'forkserver'
	# 	this is essentially voodoo to ward off evil spirits which
	# 	appear when large pool sizes are used on macOS
	# get_start_method must be set to 'allow_none', otherwise upon
	# 	checking the method it gets set (!) - and if we then get/set again
	# 	we get an error
	if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
		multiprocessing.set_start_method('forkserver')
	myPool = multiprocessing.Pool(pool_size)
	myPool.map(self.process_url, urls_to_process)

	# make sure worker processes are reaped before we report runtime
	myPool.close()
	myPool.join()

	# FYI
	self.print_runtime()
class OutputStore:
	"""
	takes the raw json output of a phantomjs page scan, parses it, and
	stores pages, cookies, and third-party elements to the wbxr mysql db
	"""

	def __init__(self, dbname):
		# ParseURI handles domain/pubsuffix/tld extraction
		self.uri_parser = ParseURI()
		self.sql_driver = MySQLDriver(dbname)
	# end init

	def store(self, uri, phantom_output):
		"""
		parse the json in phantom_output for the page at uri and store the
		page, its cookies, and its third-party elements in the db

		returns a status string describing success or the failure mode
		"""
		# parse out the json from our phantom_output
		# sometimes phantom prints out errors before the json, (despite us turning
		# 	it off!), so we match inside of the {}s to get json only
		try:
			data = json.loads(re.search(r'(\{.+\})', phantom_output).group(1))
		except Exception as e:
			self.sql_driver.log_error(uri, "Could Not Load JSON: %s" % e)
			return 'Could Not Load JSON'

		# we need to parse the domain to determine if requests are local or 3rd party
		# 	we need pubsuffix and tld for later analysis so store them now
		origin_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(uri)
		origin_domain 		= origin_domain_pubsuffix_tld[0]
		origin_pubsuffix 	= origin_domain_pubsuffix_tld[1]
		origin_tld 			= origin_domain_pubsuffix_tld[2]

		# the parser signals failure by returning an 'Exception...' string
		if re.match('^Exception.+', origin_domain):
			self.sql_driver.log_error(uri, 'Could not parse TLD for %s' % uri)
			return 'Could not parse TLD for %s' % uri

		page_domain_id = self.sql_driver.add_domain(origin_domain, origin_pubsuffix, origin_tld)

		# newFollowLogger is giving us the follow JSON data:
		# 	final_uri, title, meta_desc, requested_uris, received_uris,
		# 	cookies, source
		# note source is null for now, but store anyway

		# we are now storing uris with and without args and saving args
		# we must unquote the uri to get back to original state so can parse
		start_uri_original = urllib.parse.unquote(uri)

		try:
			start_uri_no_args = re.search(r'^(.+?)\?.+$', start_uri_original).group(1) # start uri no args
		except:
			start_uri_no_args = uri

		try:
			start_uri_args = re.search(r'^.+(\?.+)$', start_uri_original).group(1) # start uri args
		except:
			start_uri_args = 'NULL'

		# same for the final uri (this is where we are after potential redirects)
		final_uri = re.sub('\"', '', json.dumps(data["final_uri"]))
		final_uri_original = urllib.parse.unquote(final_uri)

		try:
			final_uri_no_args = re.search(r'^(.+?)\?.+$', final_uri_original).group(1) # final uri no args
		except:
			final_uri_no_args = final_uri

		try:
			final_uri_args = re.search(r'^.+(\?.+)$', final_uri_original).group(1) # final uri args
		except:
			final_uri_args = 'NULL'

		# add page
		# 	json.dumps to make sure strings go out ok for db
		page_id = self.sql_driver.add_page(
			str(re.sub('\"', '', json.dumps(data["title"]))),
			str(re.sub('\"', '', json.dumps(data["meta_desc"]))),
			uri,
			start_uri_no_args, start_uri_args,
			final_uri, final_uri_no_args, final_uri_args,
			str(re.sub('\"', '', json.dumps(data["source"]))),
			str(re.sub('\"', '', json.dumps(data["requested_uris"]))),
			str(re.sub('\"', '', json.dumps(data["received_uris"]))),
			page_domain_id)

		for cookie in data["cookies"]:
			# store cookies, uri_parser fails on non-http, we should fix this
			# 	right now a lame hack is to prepend http://
			cookie_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld("http://" + cookie["domain"])
			cookie_domain 		= cookie_domain_pubsuffix_tld[0]
			cookie_pubsuffix 	= cookie_domain_pubsuffix_tld[1]
			cookie_tld 			= cookie_domain_pubsuffix_tld[2]

			# something went wrong, but carry on...
			if re.match('^Exception.+', cookie_domain):
				self.sql_driver.log_error(uri, 'Error parsing cookie: ' + cookie_domain)
				continue

			# add_domain stores the domain if new and returns its id either way
			# 	bug fix: this was previously computed only for third-party cookies
			# 	(inside an `if origin_domain != cookie_domain` guard), so storing a
			# 	first-party cookie raised a NameError, or silently reused the id of
			# 	an earlier third-party cookie
			cookie_domain_id = self.sql_driver.add_domain(cookie_domain, cookie_pubsuffix, cookie_tld)

			# name is required, so if it is missing we just continue
			try:
				name = cookie["name"]
			except:
				continue

			domain = cookie_domain

			# these are optional, keep going with "N/A" vals
			# 	(.get is equivalent to the old try/except on a dict, just tidier)
			secure 		= cookie.get("secure", "N/A")
			path 		= cookie.get("path", "N/A")
			expires 	= cookie.get("expires", "N/A")
			httponly 	= cookie.get("httponly", "N/A")
			expiry 		= cookie.get("expiry", "N/A")
			value 		= cookie.get("value", "N/A")

			cookie_id = self.sql_driver.add_cookie(
				name, secure, path, domain,
				expires, httponly, expiry, value,
				cookie_domain_id)
			self.sql_driver.add_cookie_to_page(cookie_id, page_id)

		for request in data["requested_uris"]:
			# if the request starts with "data" we can't parse tld anyway, so skip
			if re.match('^(data|about|chrome).+', request):
				continue

			# get domain, pubsuffix, and tld from request
			requested_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(request)
			requested_domain 	= requested_domain_pubsuffix_tld[0]
			requested_pubsuffix = requested_domain_pubsuffix_tld[1]
			requested_tld 		= requested_domain_pubsuffix_tld[2]

			# see if we got back what we requested, if not a few things may have happened
			# 	* malformed uri
			# 	* resource is gone or never existed
			# 	* network latency (ie it didn't arrive in window specified)
			# 	* we could be behind a firewall or censorship mechanism (eg gfw, golden shield)
			# 	* our IP is blacklisted b/c we are totally a bot X-D
			# the point being, interpret this value with an open mind
			if request in data['received_uris']:
				received = '1'
			else:
				received = '0'

			# catch parser exceptions (signalled as 'Exception...' strings)
			if re.match('^Exception.+', requested_domain):
				self.sql_driver.log_error(uri, 'Error parsing element request: ' + request)
				continue

			# store new elements - third-party only in this version
			if origin_domain != requested_domain:
				full_uri = request
				try:
					element_uri = re.search(r'^(.+?)\?.+$', full_uri).group(1) # uri with no args
				except:
					element_uri = full_uri

				# attempt to parse off the extension
				try:
					element_extension = re.search(r'\.([0-9A-Za-z]+)$', element_uri).group(1).lower()
				except:
					element_extension = "NULL"

				# figure out what type of element it is
				if element_extension in ('png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf'):
					element_type = 'image'
				elif element_extension in ('js', 'javascript'):
					element_type = 'javascript'
				elif element_extension in ('json', 'jsonp', 'xml'):
					element_type = 'data_structured'
				elif element_extension == 'css':
					element_type = 'style_sheet'
				elif element_extension in ('woff', 'ttf', 'otf'):
					element_type = 'font'
				elif element_extension in ('htm', 'html', 'shtml'):
					element_type = 'page_static'
				elif element_extension in ('php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi'):
					element_type = 'page_dynamic'
				elif element_extension in ('swf', 'fla'):
					element_type = 'Shockwave Flash'
				elif element_extension == 'NULL':
					element_type = 'NULL'
				else:
					element_type = 'unknown'

				try:
					args = re.search(r'^.+(\?.+)$', full_uri).group(1) # uri args
				except:
					args = 'NULL'

				element_domain_id = self.sql_driver.add_domain(requested_domain, requested_pubsuffix, requested_tld)

				element_id = self.sql_driver.add_element(
					"NULL",
					full_uri, element_uri,
					received,
					element_extension, element_type,
					args, element_domain_id)
				self.sql_driver.add_element_to_page(element_id, page_id)

		return 'Successfully Added to DB'
	# end store

	def close(self):
		# close mysql connections
		self.uri_parser.close()
		self.sql_driver.close()
		return
	# end close
class Reporter:
    """Generate CSV reports (summaries, network ties, top elements/domains/orgs)
    from a completed webXray-style crawl database.

    NOTE(review): a second, divergent `class Reporter` definition appears later
    in this same file; at import time the later definition shadows this one.
    Confirm which copy is intended and delete the other.
    """

    def __init__(self, db_name, num_tlds, num_results, tracker_threshold = 0):
        # db_name: name of the MySQL database to report on (expected to carry a
        #          5-char prefix, e.g. 'wbxr_' -- see setup_report_dir)
        # num_tlds: how many top TLDs to break out; 0 means no per-TLD analysis
        # num_results: row cap per table in the "top N" reports
        # tracker_threshold: min number of sites a 3p domain must appear on to be
        #          treated as a "tracker domain"; 0/falsy disables the filter
        self.db_name = db_name
        self.sql_driver = MySQLDriver(self.db_name)
        self.num_tlds = num_tlds
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold
        self.startTime = datetime.now()
        self.pages_ok_count = self.sql_driver.pages_ok_count()

        print('\t=============================')
        print('\t Checking Output Directories ')
        print('\t=============================')

        self.setup_report_dir()

        print('\t===========================')
        print('\t Patching DB with Org Data ')
        print('\t===========================')

        # update the domains to their owners
        self.patch_org_data()

        print('\t\tSuccess!')

        # if num_tlds is 0 we don't care about sub-analysis on the tlds (which takes time)
        # so we just push in the wildcard with the total page count
        # note that this is the *default* behavior
        if self.num_tlds == 0:
            self.top_tlds = []
            self.top_tlds.append(('*',self.pages_ok_count))
        # otherwise, we *do* care about the tlds, so get them
        else:
            print('\t=====================')
            print('\t Getting top %s tlds' % self.num_tlds)
            print('\t=====================')
            print('\t\tProcessing...')
            self.top_tlds = self.get_top_tlds(self.num_tlds)
            print('\t\tSuccess!')
            print('\t\tThe top tlds are:')
            for (tld, pages) in self.top_tlds:
                print('\t\t |- %s (%s)' % (tld,pages))

        # SPECIAL SAUCE, FOR EXPERTS: tracker domains!
        #
        # idea for this is you set a threshold of the number of sites a given domain
        # is connected to - domains connecting to many sites may correlate those visits
        # via referer strings etc, so we call these 'tracker domains'
        #
        # on a really large set of sites (e.g. 1M+) this works well but on small samples
        # (e.g. 500) it doesn't work well at all as known tracker domains may only
        # appear on a single site
        #
        # this is off by default and unless you understand what you are doing...
        # DON'T USE THIS!
        #
        # longer-term we may want to train off a bigger corpus to find tracker domains and
        # have them prepackaged
        if tracker_threshold:
            print('\t=========================')
            print('\t Getting tracker domains ')
            print('\t=========================')
            print('\t\tProcessing...')
            self.tracker_domains = self.get_tracker_domains(self.tracker_threshold)
            print('\t\tSuccess!')
        else:
            # set to False so various downstream operations get skipped
            self.tracker_domains = False
    # end __init__

    #########################
    #    HELPERS, GENERAL   #
    #########################

    def setup_report_dir(self):
        """Ensure ./reports and a per-database subdirectory exist; set self.report_path."""
        # create directory for where the reports go if not exist
        if os.path.exists('./reports') == False:
            print('\t\tMaking global reports directory at ./reports.')
            os.makedirs('./reports')

        # set global report_path, trim off the wbxr_ prefix
        # NOTE(review): assumes db_name carries a 5-char prefix -- confirm callers
        self.report_path = './reports/'+self.db_name[5:]

        # set up subdir for this database analysis
        if os.path.exists(self.report_path) == False:
            print('\t\tMaking subdirectory for reports at %s' % self.report_path)
            os.makedirs(self.report_path)

        # just a notice
        print('\t\tStoring output in %s' % self.report_path)
    # setup_report_dir

    # reuse this a lot
    def write_csv(self, file_name, csv_rows):
        """Write pre-formatted CSV row strings to report_path/file_name (overwrites)."""
        full_file_path = self.report_path+'/'+file_name
        file_out = open(full_file_path, 'w')
        for row in csv_rows:
            file_out.write(row)
        file_out.close()
        print('\t\t'+'*'*40)
        print('\t\tOutput written to %s' % full_file_path)
    # write_csv

    # just fyi
    def print_runtime(self):
        """Print elapsed wall-clock time since __init__."""
        print('~='*40)
        print("Finished!")
        print("Time to process: "+str(datetime.now()-self.startTime)+"\n")
        print('-'*80)
    # end print_runtime

    # X-(
    def fatal(self, msg):
        """Print an error message and terminate the process."""
        print('FATAL ERROR: %s' % msg)
        print('EXITING.')
        exit()
    # fatal

    #############################
    #    HELPERS, DATABASE/INIT #
    #############################

    def patch_org_data(self):
        """Reset and re-load domain-ownership (org) records into the database.

        In order to analyze what entities receive user data, we need to update
        the database with domain ownership records we have stored previously.
        We first clear out what is in there in case the new data has changed.
        Perhaps make this optional; on big dbs it takes a while.
        """
        print('\t\tFlushing extant org data...')
        self.sql_driver.reset_domains_orgs()

        # next we pull the org/domain pairings from the json file in the resources dir
        # and add to the db
        print('\t\tPatching with new org data...')
        # NOTE(review): file handle is never closed -- consider a with-block
        raw_data = open(os.path.join(os.path.dirname(__file__), './resources/org_domains/org_domains.json'), 'r')
        json_data = json.load(raw_data)

        # the default id for orgs is 1, so we advance from there
        # NOTE(review): 'id' shadows the builtin
        id = 1
        for item in json_data:
            id += 1
            self.sql_driver.add_org(id, item['organization'], item['notes'], item['country'])
            for domain in item['domains']:
                self.sql_driver.update_domain_org(id, domain)
    # end patch_org_data

    def get_top_tlds(self, limit, type = 'tld'):
        """Return [(tld, page_count), ...] for the `limit` most common TLDs,
        with a ('*', total_ok_pages) wildcard entry pushed onto the front.

        type defaults to 'tld' but 'pubsuffix' also works.
        NOTE(review): 'type' shadows the builtin.
        """
        # have to do some weird double sorting b/c most_common() ordering is
        # arbitrary among equal counts, which breaks diff-based tests
        tlds = []

        for row in self.sql_driver.get_all_tlds(type):
            tlds.append(row[0])

        top_tlds = collections.Counter(tlds).most_common()

        # sort alphabetical
        top_tlds.sort()

        # sub-sort on num occurances
        top_tlds.sort(reverse=True, key=lambda item:item[1])

        # cut the array
        top_tlds = top_tlds[0:limit]

        # push in wild cards
        top_tlds.insert(0, ('*',self.pages_ok_count))

        return top_tlds
    # end get_top_tlds

    def get_tracker_domains(self, threshold):
        """Return element domains linked from at least `threshold` distinct sites.

        NOTE: first determines all pairings of page domains and element domains;
        this is unique on SITES, not on PAGES -- several pages from the same site
        only count once.
        """
        all_domains = []
        for page_domain_element_domain in self.sql_driver.get_page_domain_element_domain_pairs():
            all_domains.append(page_domain_element_domain[1])

        # count up all the pairs, convert to items() so can process as tuples
        domain_counts = collections.Counter(all_domains).items()

        # put the return values here
        tracker_domains = []

        # check against threshold
        for domain_count in domain_counts:
            if domain_count[1] >= threshold:
                tracker_domains.append(domain_count[0])

        # EDGE CASE
        # likely due to a large threshold we have no tracker domains,
        # so we throw warning and log error
        if len(tracker_domains) == 0:
            self.sql_driver.log_error('Analaysis Warning', 'Tracker Threshold of %s resulted in no tracking domains.' % threshold)
            print('\t\t-----------WARNING-----------')
            print('\t\tTracker Threshold of %s resulted in no tracking domains.' % threshold)
            print('\t\t-----------------------------')

        return tracker_domains
    # get_tracker_domains

    #########################
    #    REPORTS, GENERAL   #
    #########################

    def header(self):
        """Print and write db_summary.csv: page/cookie/element counts and percentages.

        NOTE(review): divisions assume total_pages_attempted, total_pages_ok and
        total_elements are non-zero -- an empty crawl raises ZeroDivisionError.
        """
        print('\t=================')
        print('\t General Summary')
        print('\t=================')

        output_for_csv = []

        #write csv header
        output_for_csv.append('"Item","Value"\n')

        total_pages_ok = self.sql_driver.pages_ok_count()
        print("\t\tTotal Pages OK:\t\t\t%s" % total_pages_ok)
        output_for_csv.append('"Total Pages OK","%s"\n' % total_pages_ok)

        total_pages_noload = self.sql_driver.pages_noload_count()
        total_pages_attempted = total_pages_ok + total_pages_noload
        print("\t\tTotal Pages FAIL:\t\t%s" % total_pages_noload)
        output_for_csv.append('"Total Pages FAIL","%s"\n' % total_pages_noload)

        print("\t\tTotal Pages Attempted:\t\t%s" % total_pages_attempted)
        output_for_csv.append('"Total Pages Attempted","%s"\n' % total_pages_attempted)

        percent_pages_OK = int((total_pages_ok/total_pages_attempted)*100)
        print("\t\t%% Pages OK:\t\t\t%s%%" % percent_pages_OK)
        output_for_csv.append('"%% Pages OK","%s"\n' % percent_pages_OK)

        total_errors = self.sql_driver.total_errors_count()
        print("\t\tTotal Errors:\t\t\t%s" % total_errors)
        output_for_csv.append('"Total Errors","%s"\n' % total_errors)

        total_cookies = self.sql_driver.total_cookie_count()
        print("\t\tTotal Cookies:\t\t\t%s" % total_cookies)
        output_for_csv.append('"Total Cookies","%s"\n' % total_cookies)

        total_pages_with_cookies = self.sql_driver.pages_w_cookie_count()
        print("\t\tPages with Cookies:\t\t%s" % total_pages_with_cookies)
        output_for_csv.append('"Pages with Cookies","%s"\n' % total_pages_with_cookies)

        percent_with_cookies = (total_pages_with_cookies/total_pages_ok)*100
        print("\t\t%% Pages with Cookies:\t\t%s%%" % int(percent_with_cookies))
        output_for_csv.append('"%% Pages with Cookies","%s"\n' % int(percent_with_cookies))

        total_elements = self.sql_driver.total_element_count()
        print("\t\tTotal Elements Requested:\t%s" % total_elements)
        output_for_csv.append('"Total Elements Requested","%s"\n' % total_elements)

        total_elements_received = self.sql_driver.total_element_count(received = True)
        print("\t\tTotal Elements Received:\t%s" % total_elements_received)
        output_for_csv.append('"Total Elements Received","%s"\n' % total_elements_received)

        percent_element_received = int((total_elements_received/total_elements)*100)
        print('\t\t%% Elements Received:\t\t%s%%' % percent_element_received)
        output_for_csv.append('"%% Elements Received","%s"\n' % percent_element_received)

        total_pages_with_elements = self.sql_driver.pages_w_element_count()
        print("\t\tPages with Elements:\t\t%s" % total_pages_with_elements)
        output_for_csv.append('"Pages with Elements","%s"\n' % total_pages_with_elements)

        percent_with_elements = (total_pages_with_elements/total_pages_ok)*100
        print("\t\t%% Pages with Elements:\t\t%s%%" % int(percent_with_elements))
        output_for_csv.append('"%% Pages With Elements","%s"\n' % int(percent_with_elements))

        self.write_csv('db_summary.csv', output_for_csv)
        print('\t'+'-'*80+'\n')
    # header

    def get_network_ties(self):
        """Write network.csv: (page_domain, 3p_org, 3p_domain) edges for graph analysis."""
        # this report generates data necessary for graph/network analysis by
        # outputting a list of page domains and the elements/orgs they connect to
        print('\t=============================')
        print('\t Processing Network Ties ')
        print('\t=============================')

        # put output here
        output_for_csv = []

        # header row for csv
        output_for_csv.append('"page_domain","3p_org","3p_domain"\n')

        # sql_driver.get_network_ties returns a set of tuples in the format
        # (page domain, page org, element domain, element org)
        # we just go through this data to produce the report
        # note: the report is currently omitting page org, but it can be altered easily
        for edge in self.sql_driver.get_network_ties():
            # if a page has no elements, edge[2] will be 'None' so we skip it
            # an alternate approach would be to include as orphan nodes
            if edge[2]:
                output_for_csv.append('"%s","%s","%s",\n' % (edge[0],edge[3],edge[2]))

        # done!
        self.write_csv('network.csv', output_for_csv)
        print('\t'+'-'*80+'\n')
    # get_network_ties

    def get_summary_by_tld(self):
        """Write summary_by_tld.csv: per-TLD page counts, 3p element/cookie/js
        percentages, and 3p-domain mean/median/mode stats."""
        print('\t=============================')
        print('\t Processing Summaries by TLD ')
        print('\t=============================')

        output_for_csv = []
        output_for_csv.append('"TLD","N","% TOTAL","N W/3PE","% W/3PE","N W/COOKIE","% W/COOKIE","N W/JS","% W/JS","3P DOMAIN MEAN","3P DOMAIN MEDIAN","3P DOMAIN MODE","TRACKER FILTER DEPTH"\n')

        # now do per-tld numbers
        for tld in self.top_tlds:
            print('\t\tGetting summary for %s' % tld[0])

            # '*' is the wildcard entry; an empty filter means "all pages"
            if tld[0] != '*':
                tld_filter = tld[0]
            else:
                tld_filter = ''

            total_pages = self.sql_driver.get_complex_page_count(tld_filter)
            # NOTE(review): zero pages for a tld would raise ZeroDivisionError below
            total_pages_percent = (total_pages/self.pages_ok_count)*100
            total_pages_elements = self.sql_driver.get_complex_page_count(tld_filter, 'elements', self.tracker_domains)
            percent_with_elements = (total_pages_elements/total_pages)*100
            total_pages_cookies = self.sql_driver.get_complex_page_count(tld_filter, 'cookies', self.tracker_domains)
            percent_with_cookies = (total_pages_cookies/total_pages)*100
            total_pages_js = self.sql_driver.get_complex_page_count(tld_filter, 'javascript', self.tracker_domains)
            percent_with_js = (total_pages_js/total_pages)*100

            # record what (if any) tracker filter produced these numbers
            if self.tracker_threshold:
                filter_depth = self.tracker_threshold
            else:
                filter_depth = 'No Tracker Filter Used'

            stats = self.get_page_3p_stats(tld[0])
            mean = stats[0]
            median = stats[1]
            mode = stats[2]

            output_for_csv.append('"%s","%s","%.2f","%s","%.2f","%s","%.2f","%s","%.2f","%.2f","%s","%s","%s"\n' % (
                tld[0],
                total_pages,
                total_pages_percent,
                total_pages_elements,
                percent_with_elements,
                total_pages_cookies,
                percent_with_cookies,
                total_pages_js,
                percent_with_js,
                mean,
                median,
                mode,
                filter_depth))

        self.write_csv('summary_by_tld.csv', output_for_csv)
    # end get_summary_by_tld

    #####################
    #   REPORTS, MAIN   #
    #####################

    def get_reports_by_tld(self, type='', sub_type=''):
        """Write a '<type>-by-tld[-<sub_type>].csv' report of the top N
        elements/domains/orgs per TLD.

        type must be 'elements', 'domains', or 'orgs' (fatal otherwise).
        NOTE(review): 'type' shadows the builtin, and the `type is 'elements'`
        comparisons rely on CPython string interning -- should be `==`.
        """
        print('\t=============================')
        print('\tProcessing Top %s %s %s' % (self.num_results, type, sub_type))
        print('\t=============================')

        # keep output here
        csv_output = []

        # write out the header row for the csv
        if type is 'elements':
            csv_output.append('"TLD","TLD Rank","Intra-TLD Rank","Organization","Country", "Element","Extension","Type","Domain","Total Pages","Raw Count","Percent Total"\n')
        elif type is 'domains':
            csv_output.append('"TLD","TLD Rank","Intra-TLD Rank","Domain","Organization","Country","Number of Pages","Raw Count","Percent Total"\n')
        elif type is 'orgs':
            csv_output.append('"TLD","TLD Rank","Intra-TLD Rank","Organization","Country","Number of Pages","Raw Count","Percent Total"\n')
        else:
            self.fatal('Wrong type specified in get_reports_by_tld, must be "elements", "domains", or "orgs".')

        tld_count = 0

        for tld in self.top_tlds:
            current_tld = tld[0]
            total_pages = tld[1]

            print('\t\tcurrently on: '+current_tld)

            # filter on current page tld; the '*' wildcard gets no filter and no rank
            if tld[0] != '*':
                tld_filter = tld[0]
                tld_count += 1
            else:
                tld_filter = ''

            # get results with specified filter
            results_rows = self.get_results_rows(total_pages, type, sub_type, tld_filter)

            current_row = 0

            # loop results
            for result_row in results_rows:
                current_row += 1
                if type is 'elements':
                    csv_row = '"%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % (
                        current_tld,
                        tld_count,
                        current_row,
                        result_row[0][0], # org_name
                        result_row[0][1], # country
                        result_row[0][2], # element
                        result_row[0][3], # extension
                        result_row[0][4], # type
                        result_row[0][5], # domain
                        total_pages,
                        result_row[1],    # raw_count
                        (result_row[1]/total_pages)*100)
                elif type is 'domains':
                    total_item = result_row[1]
                    csv_row = '"%s","%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % (
                        current_tld,
                        tld_count,
                        current_row,
                        result_row[0][0], # domain_name
                        result_row[0][1], # org_name
                        result_row[0][2], # org_country
                        total_pages,
                        total_item,
                        (total_item/total_pages)*100)
                elif type is 'orgs':
                    total_item = result_row[1]
                    csv_row = '"%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % (
                        current_tld,
                        tld_count,
                        current_row,
                        result_row[0][0], # org_name
                        result_row[0][1], # org_country
                        total_pages,
                        total_item,
                        (total_item/total_pages)*100)

                csv_output.append(csv_row)

                # cap each tld's table at num_results rows
                if current_row >= self.num_results:
                    break

        # write out csv
        file_name = type + '-by-tld'

        # this really only applied to elements at present
        if sub_type:
            file_name += '-' + sub_type

        file_name += '.csv'

        # store the report
        self.write_csv(file_name, csv_output)
    # end get_reports_by_tld

    def get_results_rows(self, total_pages, type, sub_type = '', tld_filter=''):
        """Query elements/domains/orgs, count unique occurrences, and return a
        stably-sorted [(key_tuple, count), ...] list (count desc, then alpha).

        NOTE(review): same `type is '...'` interning reliance as get_reports_by_tld;
        total_pages is accepted but unused here.
        """
        # query the db
        if type is 'elements':
            # rows are (page.start_uri, org.name, element.element_uri, element.extension, element.type, element_domain.domain)
            query_results = self.sql_driver.get_elements(tld_filter, sub_type)
        elif type is 'domains':
            # rows are (page.start_uri, element_domain.domain, org.name)
            query_results = self.sql_driver.get_domains(tld_filter)
        elif type is 'orgs':
            # rows are (page.start_uri, org.name)
            query_results = self.sql_driver.get_orgs(tld_filter)
        else:
            self.fatal('Type must be elements, domains, or orgs.')

        # count up the unique elements, domains, or orgs we are looking for
        results_counter = collections.Counter()
        for row in query_results:
            # remove first element in tuple as it is the page uri, now irrelevant
            # add rest to results counter as this is what we care about now
            results_counter[row[1:]] += 1

        # python's most_common() arbitrarily re-orders items with same value, making
        # debugging a nightmare, so we have to double sort here

        # convert to list we can sort
        results_counter = results_counter.most_common()

        # sort alphabetical
        results_counter.sort()

        # sub-sort on num occurrences
        results_counter.sort(reverse=True, key=lambda item:item[1])

        return results_counter
    # end get_results_rows

    def get_page_3p_stats(self, tld = ''):
        """Return (mean, median, mode) of the per-page count of 3p element domains.

        This function calls get_page_uri_element_domain_pairs to get a list of
        tuples s.t. each tuple is a unique (page address, domain of an element)
        pairing; the list is then iterated to count how many domains each page
        is linked to.

        IMPORTANT: get_page_uri_element_domain_pairs is *already* sorted by page,
        otherwise the loop below would not work.

        NOTE(review): statistics.mean/median raise on an empty list, and a final
        page whose (tracker-filtered) count is 0 is never appended -- confirm
        these edge cases are acceptable upstream.
        """
        # '*' wildcard means no tld filter
        if tld == '*':
            tld = ''

        # init vars
        this_page_element_count = 0
        all_page_element_counts = []
        last_page = ''

        # run query, process rows
        for row in self.sql_driver.get_page_uri_element_domain_pairs(tld):
            # page has no trackers, count is zero
            if not row[1]:
                all_page_element_counts.append(0)
            # page has trackers, add count
            else:
                # this is the same page, increment count
                if row[0] == last_page:
                    # when the tracker filter is active only count filtered domains
                    if self.tracker_domains:
                        if row[1] in self.tracker_domains:
                            this_page_element_count += 1
                    else:
                        this_page_element_count += 1
                # this is a new page, store our running count, reset to 1
                # update last_page
                else:
                    if last_page != '':
                        all_page_element_counts.append(this_page_element_count)
                    last_page = row[0]
                    if self.tracker_domains:
                        if row[1] in self.tracker_domains:
                            this_page_element_count = 1
                        else:
                            this_page_element_count = 0
                    else:
                        this_page_element_count = 1

        # if we have an outstanding value, give it an increment
        if this_page_element_count != 0:
            # enter the last value into the list
            all_page_element_counts.append(this_page_element_count)

        # mean and median should always be ok
        mean = statistics.mean(all_page_element_counts)
        median = statistics.median(all_page_element_counts)

        # but mode can throw an error (no unique mode), so catch here
        try:
            mode = statistics.mode(all_page_element_counts)
        except:
            mode = 'NULL'

        return(mean, median, mode)
    # get_page_3p_stats
# end class Reporter
class ParseURI:
    """Split URIs into (domain, public suffix, tld) using the Mozilla public
    suffix list, with results cached in a shared 'sub_domain_tld' MySQL db."""

    def __init__(self):
        # load up the tld list now as only hit it once this way
        self.pubsuffix_list = self.get_pubsuffix_list()

        # this is to speed up tlds lookup with hash table
        # we can share among all runs over time
        self.sql_driver = MySQLDriver()

        # check if sub_domain_tld exists, if not create one
        # this should only really happen once
        #
        # bug?
        # when using pool this can be done several times in tandem
        # resulting in some lost entries - e.g. two threads see db not exist, the first
        # one to break the tie will create a new blank db and add a new entry;
        # it is possible the slightly slower second thread will also create a blank db
        # erasing the entry of the first thread. however, after the first round the db
        # should not be re-created again
        #
        # it should be emphasized this db is ONLY used to speed up parsing the tld
        # (partially because the regex method below is slow and should be refactored!)
        # the data in this db is NOT used for analysis - thus deleting a few records
        # will have a minor impact on speed for the first few entries, and then be
        # wholly irrelevant
        #
        # still, this entire class could be smarter; fix it another day
        if self.sql_driver.check_db_exist('sub_domain_tld') == False:
            self.sql_driver.create_sub_domain_tld_db()
        else:
            self.sql_driver.db_switch('sub_domain_tld')
    # end __init__

    def get_pubsuffix_list(self):
        """Load the bundled public suffix list, longest-first, with the most
        common suffixes moved to the front for faster matching."""
        # get the file from the local dir
        # NOTE(review): file handle is never closed -- consider a with-block
        pubsuffix_raw_list = open(os.path.join(os.path.dirname(__file__), './resources/pubsuffix/wbxrPubSuffixList.txt'), 'r')

        pubsuffix_list = []

        for line in pubsuffix_raw_list:
            # the last part of the list is private-domain entries we don't care
            # about, so stop reading
            if re.match("^// ===BEGIN PRIVATE DOMAINS===", line):break

            # skip lines that are comments or blank, add others to list
            # also remove leading ., !, and * as they break the regex later
            if not re.match("^//.+$|^$", line):
                pubsuffix_list.append(re.sub('^[\!\*]\.?', '', line.strip()))

        # we sort long->short so we can match deeper TLD first (ie, ac.uk *before* .uk)
        pubsuffix_list.sort(key=len,reverse=True)

        # to speed things up we move the most common TLD to the front of the line
        # that said, if it's not one of these we take a MASSIVE performance hit
        # (safe despite breaking the long-first ordering: matches below are
        # anchored at end-of-string, so e.g. 'com' cannot shadow 'com.br')
        popular_pubsuffixs = ['gov', 'co.uk', 'mil', 'org', 'net', 'edu', 'com']
        for popular_pubsuffix in popular_pubsuffixs:
            pubsuffix_list.remove(popular_pubsuffix)
            pubsuffix_list.insert(0, popular_pubsuffix)

        return pubsuffix_list
    # get_pubsuffix_list

    def get_domain_pubsuffix_tld(self, uri):
        """Return (domain, pubsuffix, tld) for a http(s) uri, or a triple of
        'Exception: Unable to parse: ...' strings on failure.

        IPv4 addresses return (address, 'None', 'None'); successful parses are
        cached in the sub_domain_tld db.
        """
        # pull out the first chunk of the domain, possibly including subdomains
        # if this fails, the domain is broken (or not http(s) at least), bail out
        # only handles if the domain is followed by $, ?, \, or / - other input will break...
        # adding to handle with port at end, ie blah.com:8080, so rule is '|\:[0-9].+
        # adding \.? at the top to handle leading '.'
        try:
            sub_domain = re.search('^https?:\/\/\.?(.*?)(\:[0-9].+)?($|\/|\?|\\\\|=)', uri).group(1)
        except:
            return('Exception: Unable to parse: '+uri[:50],
                   'Exception: Unable to parse: '+uri[:50],
                   'Exception: Unable to parse: '+uri[:50])

        # strip off leading or trailing '.', these break the lookups below
        sub_domain = re.sub('(^\.|\.$)', '', sub_domain)

        # see if it is an ip address, if so return it
        # this is only ipv4 pattern though, maybe should think about ipv6
        try:
            re.search('(^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$)', sub_domain).group(1)
            return (sub_domain, 'None', 'None')
        except:
            pass

        # first, we will see if it is in the db already
        record = self.sql_driver.sub_domain_exists(sub_domain)

        if record:
            return record

        # the pubsuffix list is large -> small,
        # so the first match is the one we want
        # after we find it, break out and continue
        for pubsuffix_try in self.pubsuffix_list:
            if re.match(".+\."+pubsuffix_try+"$", sub_domain):
                pubsuffix = pubsuffix_try
                break

        # if we didn't find the pubsuffix we fail
        # (EAFP: 'pubsuffix' is only bound if the loop above matched)
        try:
            pubsuffix
        except:
            return('Exception: Unable to parse: '+uri[:50],
                   'Exception: Unable to parse: '+uri[:50],
                   'Exception: Unable to parse: '+uri[:50])

        # if we have sub.domain.tld_match, we just want domain.tld_match
        # there is no reason this should fail if we get this far, but try/except to be safe
        try:
            domain = re.search('(.*\.)?\.?(.*\.'+pubsuffix+')$', sub_domain).group(2)
        except:
            return('Exception: Unable to parse: '+uri[:50],
                   'Exception: Unable to parse: '+uri[:50],
                   'Exception: Unable to parse: '+uri[:50])

        # grab the tld off of the pubsuffix
        # if regex fails the tld and pubsuffix are the same
        try:
            tld = re.search('\.([0-9A-Za-z]+)$', pubsuffix).group(1).lower()
        except:
            tld = pubsuffix

        # cache the result for future lookups
        self.sql_driver.add_sub_domain_pubsuffix_tld(sub_domain, domain, pubsuffix, tld)

        return (domain, pubsuffix, tld)
    #end get_domain_pubsuffix_tld

    def close(self):
        # close mysql connection
        self.sql_driver.close()
        return
    # end close
#end ParseURI
class Reporter: def __init__(self, db_name, num_tlds, num_results, tracker_threshold=0): self.db_name = db_name self.sql_driver = MySQLDriver(self.db_name) self.num_tlds = num_tlds self.num_results = num_results self.tracker_threshold = tracker_threshold self.startTime = datetime.now() self.pages_ok_count = self.sql_driver.pages_ok_count() print('\t=============================') print('\t Checking Output Directories ') print('\t=============================') self.setup_report_dir() print('\t===========================') print('\t Patching DB with Org Data ') print('\t===========================') # update the domains to their owners self.patch_org_data() print('\t\tSuccess!') print('\t=====================') print('\t Getting top %s tlds' % self.num_tlds) print('\t=====================') print('\t\tProcessing...') self.top_tlds = self.get_top_tlds(self.num_tlds) print(self.top_tlds) print('\t\tSuccess!') print('\t\tThe top tlds are:') for (tld, pages) in self.top_tlds: print('\t\t |- %s (%s)' % (tld, pages)) # SPECIAL SAUCE, FOR EXPERTS: tracker domains! # # idea for this is you set a threshold of the number of sites a given domain # is connected to - domains connecting to many sites may correlate those visits # via referer strings etc, so we call these 'tracker domains' # # on a really large set of sites (e.g. 1M+) this works well but on small samples # (e.g. 500) it doesn't work well at all as known tracker domains may only # appear on a single site # # this is off by default and unless you understand what you are doing... # DON'T USE THIS! 
# # longer-term we may want to train off a bigger corpus to find tracker domains and # have them prepackaged # if tracker_threshold: print('\t=========================') print('\t Getting tracker domains ') print('\t=========================') print('\t\tProcessing...') self.tracker_domains = self.get_tracker_domains( self.tracker_threshold) print('\t\tSuccess!') else: self.tracker_domains = [] # end __init__ ######################### # HELPERS, GENERAL # ######################### def setup_report_dir(self): # create directory for where the reports go if not exist if os.path.exists('./reports') == False: print('\tMaking reporting directory.') os.makedirs('./reports') # set global report_path, trim off the wbxr_ prefix self.report_path = './reports/' + self.db_name[5:] # set up subdir for this database analysis if os.path.exists(self.report_path) == False: print('\tMaking subdirectory for reports.') os.makedirs(self.report_path) # just a notice print('\t\tWriting output to %s' % self.report_path) # setup_report_dir # reuse this a lot def write_csv(self, file_name, csv_rows): full_file_path = self.report_path + '/' + file_name file_out = open(full_file_path, 'w') for row in csv_rows: file_out.write(row) file_out.close() print('\t\t' + '*' * 40) print('\t\tOutput written to %s' % full_file_path) # write_csv # just fyi def print_runtime(self): print('~=' * 40) print("Finished!") print("Time to process: " + str(datetime.now() - self.startTime) + "\n") print('-' * 80) # end print_runtime # X-( def fatal(self, msg): print('FATAL ERROR: %s' % msg) print('EXITING.') exit() # fatal ############################# # HELPERS, DATABASE/INIT # ############################# def patch_org_data(self): # in order to analyze what entities receive user data, we need to update # the database with domain ownership records we have store previously # # we first clear out what is in there in case the new data has changed # perhaps make this optional, on big dbs takes a while # 
print('\t\tFlushing extant org data...') self.sql_driver.reset_domains_orgs() # next we pull the org/domain pairings from the json file in the resources dir # and add to the db print('\t\tPatching with new org data...') raw_data = open( os.path.join(os.path.dirname(__file__), './resources/org_domains/org_domains.json'), 'r') json_data = json.load(raw_data) # the default id for orgs is 1, so we advance from there id = 1 for item in json_data: id += 1 self.sql_driver.add_org(id, item['organization'], item['notes'], item['country']) for domain in item['domains']: self.sql_driver.update_domain_org(id, domain) # end patch_org_data def get_top_tlds(self, limit, type='tld'): # finds the most common tlds from all the pages # type is default to tld, but pubsuffix also works # have to do some weird sorting b/c python is arbitrary and f***s up diff tests # returns array tlds = [] for row in self.sql_driver.get_all_tlds(type): tlds.append(row[0]) top_tlds = collections.Counter(tlds).most_common() # sort alphabetical top_tlds.sort() # sub-sort on num occurances top_tlds.sort(reverse=True, key=lambda item: item[1]) # cut the array top_tlds = top_tlds[0:limit] # push in wild cards top_tlds.insert(0, ('*', self.pages_ok_count)) return top_tlds # end get_top_tlds def get_tracker_domains(self, threshold=0): # first finds all pairings of page domains and element domains # note this is then unique on SITE, not on PAGE # returns the list of domains which appear on at least the threshold number domains = [] for page_domain_element_domain in self.sql_driver.get_page_domain_element_domain_pairs( ): domains.append(page_domain_element_domain[1]) # count up all the pairs, convert to items() so can process as tuples domain_counts = collections.Counter(domains).items() # put the return values here tracker_domains = [] # check against threshold for domain_count in domain_counts: if domain_count[1] >= threshold: tracker_domains.append(domain_count[0]) return tracker_domains # get_tracker_domains 
######################### # REPORTS, GENERAL # ######################### def header(self): # just outputs really basic data about how many records in db, etc. # print('\t=================') print('\t General Summary') print('\t=================') output_for_csv = [] #write csv header output_for_csv.append('"Item","Value"\n') total_pages = self.sql_driver.pages_ok_count() print("\t\tTotal Pages OK:\t\t\t%s" % total_pages) output_for_csv.append('"Total Pages OK","%s"\n' % total_pages) total_pages_noload = self.sql_driver.pages_noload_count() total_pages_attempted = total_pages + total_pages_noload print("\t\tTotal Pages FAIL:\t\t%s" % total_pages_noload) output_for_csv.append('"Total Pages FAIL","%s"\n' % total_pages_noload) print("\t\tTotal Pages Attempted:\t\t%s" % total_pages_attempted) output_for_csv.append('"Total Pages Attempted","%s"\n' % total_pages_attempted) percent_pages_OK = int((total_pages / total_pages_attempted) * 100) print("\t\t%% Pages OK:\t\t\t%s%%" % percent_pages_OK) output_for_csv.append('"%% Pages OK","%s"\n' % percent_pages_OK) total_errors = self.sql_driver.total_errors_count() print("\t\tTotal Errors:\t\t\t%s" % total_errors) output_for_csv.append('"Total Errors","%s"\n' % total_errors) total_cookies = self.sql_driver.total_cookie_count() print("\t\tTotal Cookies:\t\t\t%s" % total_cookies) output_for_csv.append('"Total Cookies","%s"\n' % total_cookies) total_pages_with_cookies = self.sql_driver.pages_w_cookie_count() print("\t\tPages with Cookies:\t\t%s" % total_pages_with_cookies) output_for_csv.append('"Pages with Cookies","%s"\n' % total_pages_with_cookies) percent_with_cookies = (total_pages_with_cookies / total_pages) * 100 print("\t\t%% Pages with Cookies:\t\t%s%%" % int(percent_with_cookies)) output_for_csv.append('"%% Pages with Cookies","%s"\n' % int(percent_with_cookies)) total_elements = self.sql_driver.total_element_count() print("\t\tTotal Elements Requested:\t%s" % total_elements) output_for_csv.append('"Total Elements 
Requested","%s"\n' % total_elements) total_elements_received = self.sql_driver.total_element_count( received=True) print("\t\tTotal Elements Received:\t%s" % total_elements_received) output_for_csv.append('"Total Elements Received","%s"\n' % total_elements_received) percent_element_received = int( (total_elements_received / total_elements) * 100) print('\t\t%% Elements Received:\t\t%s%%' % percent_element_received) output_for_csv.append('"%% Elements Received", "%s"\n' % percent_element_received) total_pages_with_elements = self.sql_driver.pages_w_element_count() print("\t\tPages with Elements:\t\t%s" % total_pages_with_elements) output_for_csv.append('"Pages with Elements","%s"\n' % total_pages_with_elements) percent_with_elements = (total_pages_with_elements / total_pages) * 100 print("\t\t%% Pages with Elements:\t\t%s%%" % int(percent_with_elements)) output_for_csv.append('"%% Pages With Elements","%s"\n' % int(percent_with_elements)) self.write_csv('db_summary.csv', output_for_csv) print('\t' + '-' * 80 + '\n') # header def get_network_ties(self): print('\t=============================') print('\t Processing Network Ties ') print('\t=============================') output_for_csv = [] # can also include the page_org in the report, but commented out for now # at a later date this could be an option # output_for_csv.append('"page_domain","page_org","3p_domain","3p_org"\n') output_for_csv.append('"page_domain","3p_domain","3p_org"\n') for edge in self.sql_driver.get_network_ties(): # output_for_csv.append('"%s","%s","%s","%s"\n' % (edge[0],edge[1],edge[2],edge[3])) output_for_csv.append('"%s","%s","%s",\n' % (edge[0], edge[2], edge[3])) self.write_csv('network.csv', output_for_csv) print('\t' + '-' * 80 + '\n') # get_network_ties def get_summary_by_tld(self): print('\t=============================') print('\t Processing Summaries by TLD ') print('\t=============================') output_for_csv = [] output_for_csv.append( '"TLD","N","% TOTAL","N W/3PE","% W/3PE","N 
W/COOKIE","% W/COOKIE","N W/JS","% W/JS","3P DOMAIN MEAN","3P DOMAIN MEDIAN","3P DOMAIN MODE" \n' ) # now do per-tld numbers for tld in self.top_tlds: print('\t\tGetting summary for %s' % tld[0]) if tld[0] != '*': tld_filter = tld[0] else: tld_filter = '' total_pages = self.sql_driver.get_complex_page_count(tld_filter) total_pages_percent = (total_pages / self.pages_ok_count) * 100 total_pages_elements = self.sql_driver.get_complex_page_count( tld_filter, 'elements', self.tracker_domains) percent_with_elements = (total_pages_elements / total_pages) * 100 total_pages_cookies = self.sql_driver.get_complex_page_count( tld_filter, 'cookies', self.tracker_domains) percent_with_cookies = (total_pages_cookies / total_pages) * 100 total_pages_js = self.sql_driver.get_complex_page_count( tld_filter, 'javascript', self.tracker_domains) percent_with_js = (total_pages_js / total_pages) * 100 stats = self.get_page_3p_stats(tld[0]) mean = stats[0] median = stats[1] mode = stats[2] output_for_csv.append( '"%s","%s","%.2f","%s","%.2f","%s","%.2f","%s","%.2f","%.2f","%s","%s"\n' % (tld[0], total_pages, total_pages_percent, total_pages_elements, percent_with_elements, total_pages_cookies, percent_with_cookies, total_pages_js, percent_with_js, mean, median, mode)) self.write_csv('summary_by_tld.csv', output_for_csv) # end get_summary_by_tld ##################### # REPORTS, MAIN # ##################### def get_reports_by_tld(self, type='', sub_type=''): print('\t=============================') print('\tProcessing Top %s %s %s' % (self.num_results, type, sub_type)) print('\t=============================') # keep output here csv_output = [] # write out the header row for the csv if type is 'elements': csv_output.append( '"TLD","TLD Rank","Intra-TLD Rank","Organization","Country", "Element","Extension","Type","Domain","Total Pages","Raw Count","Percent Total"\n' ) elif type is 'domains': csv_output.append( '"TLD","TLD Rank","Intra-TLD Rank","Domain","Organization","Country","Number of 
Pages","Raw Count","Percent Total"\n' ) elif type is 'orgs': csv_output.append( '"TLD","TLD Rank","Intra-TLD Rank","Organization","Country","Number of Pages","Raw Count","Percent Total"\n' ) else: self.fatal( 'Wrong type specified in get_reports_by_tld, must be "elements", "domains", or "orgs".' ) tld_count = 0 for tld in self.top_tlds: current_tld = tld[0] total_pages = tld[1] print('\t\tcurrently on: ' + current_tld) # filter on current page tld if tld[0] != '*': tld_filter = tld[0] tld_count += 1 else: tld_filter = '' # get results with specified filter results_rows = self.get_results_rows(total_pages, type, sub_type, tld_filter) current_row = 0 # loop results for result_row in results_rows: current_row += 1 if type is 'elements': csv_row = '"%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % ( current_tld, tld_count, current_row, result_row[0][0], # org_name result_row[0][1], # country result_row[0][2], # element result_row[0][3], # extension result_row[0][4], # type result_row[0][5], # domain total_pages, result_row[1], # raw_count (result_row[1] / total_pages) * 100) elif type is 'domains': total_item = result_row[1] csv_row = '"%s","%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % ( current_tld, tld_count, current_row, result_row[0][0], # domain_name result_row[0][1], # org_name result_row[0][2], # org_country total_pages, total_item, (total_item / total_pages) * 100) elif type is 'orgs': total_item = result_row[1] csv_row = '"%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % ( current_tld, tld_count, current_row, result_row[0][0], # org_name result_row[0][1], # org_country total_pages, total_item, (total_item / total_pages) * 100) csv_output.append(csv_row) if current_row >= self.num_results: break # write out csv file_name = type + '-by-tld' # this really only applied to elements at present if sub_type: file_name += '-' + sub_type file_name += '.csv' # store the report self.write_csv(file_name, csv_output) # end get_reports_by_tld def 
get_results_rows(self, total_pages, type, sub_type='', tld_filter=''): # this queries the db to get all elements, domains, or orgs # next they are counted to find the most common # and formatted to csv rows and returned # query the db if type is 'elements': # rows are (page.start_uri, org.name, element.element_uri, element.extension, element.type, element_domain.domain) query_results = self.sql_driver.get_elements(tld_filter, sub_type) elif type is 'domains': # rows are (page.start_uri, element_domain.domain, org.name) query_results = self.sql_driver.get_domains(tld_filter) elif type is 'orgs': # row are page.start_uri, org.name query_results = self.sql_driver.get_orgs(tld_filter) else: self.fatal('Type must be elements, domains, or orgs.') # count up the unique elements, domains, or orgs we are looking for results_counter = collections.Counter() for row in query_results: # remove first element in tuple as it is the page uri, now irrelevant # add rest to results counter as this is what we care about now results_counter[row[1:]] += 1 # python's most_common() arbitrarily re-orders items with same value, making # debugging a nightmare, so we have to double sort here # convert to list we can sort results_counter = results_counter.most_common() # sort alphabetical results_counter.sort() # sub-sort on num occurrences results_counter.sort(reverse=True, key=lambda item: item[1]) return results_counter # end get_results_rows def get_page_3p_stats(self, tld=''): # This function calls get_page_element_domain_pairs to get a list of tuples # st. 
each tuple is a unique (page address, domain of an element) paring # This list of tuples is then iterated so that we count how many domains # each page is linked to # IMPORTANT: get_page_element_domain_pairs is *already* sorted by page, otherwise # loop below would not work if tld == '*': tld = '' # init vars this_page_element_count = 0 all_page_element_counts = [] last_page = '' # run query, process rows for row in self.sql_driver.get_page_uri_element_domain_pairs(tld): # page has no tackers, count is zero if not row[1]: all_page_element_counts.append(0) # page has trackers, add count else: # this is the same page, increment count if row[0] == last_page: if self.tracker_domains: if row[1] in self.tracker_domains: this_page_element_count += 1 else: this_page_element_count += 1 # this is a new page, store our running count, reset to 1 # update last_page else: if last_page != '': all_page_element_counts.append(this_page_element_count) last_page = row[0] if self.tracker_domains: if row[1] in self.tracker_domains: this_page_element_count = 1 else: this_page_element_count = 0 else: this_page_element_count = 1 # if we have an outstanding value, give it an increment if this_page_element_count != 0: # enter the last value into the list all_page_element_counts.append(this_page_element_count) # mean and median should always be ok mean = statistics.mean(all_page_element_counts) median = statistics.median(all_page_element_counts) # but mode can throw an error, so catch here try: mode = statistics.mode(all_page_element_counts) except: mode = 'NULL' return (mean, median, mode) # get_page_3p_stats # end class Reporter
def process_url(self, url):
    """
    this function takes a specified url, loads it in the browser (currently phantomjs)
    and returns json-formatted output with relevant request data, etc.

    the output_store class then puts this data in the db for later analysis

    On any load/store failure the error is logged via sql_driver.log_error
    and the method returns early; the db connection is closed on all paths.
    """
    # set up sql connection used to log errors and do timeseries checks
    if self.db_engine == 'mysql':
        from webxray.MySQLDriver import MySQLDriver
        sql_driver = MySQLDriver(self.db_name)
    elif self.db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        sql_driver = PostgreSQLDriver(self.db_name)
    elif self.db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        sql_driver = SQLiteDriver(self.db_name)
    else:
        # bug fix: an unrecognized engine previously fell through and caused a
        # confusing NameError when sql_driver was first used below
        raise ValueError('Unsupported db_engine: %s' % self.db_engine)

    # output store does the heavy lifting of analyzing browser output and storing to db
    output_store = OutputStore(self.db_engine, self.db_name)

    # support for loading same page with multiple browsers - purposefully undocumented
    for browser_type in self.browser_types:
        # import and set up specified browser driver
        # note we need to set up a new browser each time to
        # get a fresh profile
        if browser_type == 'phantomjs':
            browser_driver = PhantomDriver()
        elif browser_type == 'chrome':
            browser_driver = ChromeDriver(ua=self.chrome_ua)

        # support for timeseries collections - purposefully undocumented
        if self.allow_timeseries:
            page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url, browser_type)
            if page_last_accessed_browser_type:
                time_diff = datetime.now() - page_last_accessed_browser_type[0]
                if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
                    print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
                    continue

        # attempt to load the page, fail gracefully
        # (narrowed from a bare except: still best-effort, but no longer
        # swallows SystemExit/KeyboardInterrupt)
        try:
            browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
        except Exception:
            print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
            sql_driver.log_error(url, 'Unable to load page')
            sql_driver.close()
            return

        # if there was a problem we log the error
        if browser_output['success'] == False:
            print('\t\t%-50s Browser %s Error: %s' % (url[:50], browser_type, browser_output['result']))
            sql_driver.log_error(url, 'Unable to load page')
            sql_driver.close()
            return
        else:
            # no error, treat result as browser output
            browser_output = browser_output['result']

        # attempt to store the output
        if output_store.store(url, browser_output):
            print('\t\t%-50s Success with %s' % (url[:50], browser_type))
        else:
            print('\t\t%-50s Fail with %s' % (url[:50], browser_type))
            sql_driver.log_error(url, 'Unable to load page')

    # fix: close the connection on the success path too, not only on errors
    sql_driver.close()
    return
def run(self, pool_size):
    """
    Build the list of pages to scan from ./page_lists/<pages_file_name>,
    skipping comments, invalid/non-http(s) addresses, binary documents,
    duplicates, and pages already in the db, then scan them with a pool
    of `pool_size` worker processes.
    """
    # bug fix: the file handle was previously never closed; the with-block
    # guarantees release. Narrowed the bare except to OSError.
    try:
        with open('./page_lists/' + self.pages_file_name, 'r') as page_list_file:
            uri_list = page_list_file.readlines()
    except OSError:
        print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name)
        exit()

    sql_driver = MySQLDriver(self.db_name)

    # sort out what uris we are processing from the list
    uris_to_process = []
    count = 0

    print('\t------------------------')
    print('\t Building List of Pages ')
    print('\t------------------------')

    for uri in uri_list:
        # skip lines that are comments
        if uri.startswith('#'):
            continue

        count += 1

        # only do lines starting with https?://
        if not re.match(r'^https?://.+', uri):
            print("\t\t%s | %-50s Not a valid address, Skipping." % (count, uri[:50]))
            continue

        # non-ascii domains will crash phantomjs, so we need to convert them to
        # idna/ascii/utf-8
        # this requires splitting apart the uri, converting the domain to idna,
        # and pasting it all back together. ugly.
        parsed_uri = urlsplit(uri.strip())
        uri = parsed_uri[0] + "://"
        uri += parsed_uri[1].encode('idna').decode('utf-8')
        # if chunks exist glue them back together
        if len(parsed_uri[2]) != 0:
            uri += parsed_uri[2]
        if len(parsed_uri[3]) != 0:
            uri += '?' + parsed_uri[3]
        if len(parsed_uri[4]) != 0:
            uri += '#' + parsed_uri[4]

        # if it is a m$ office or other doc, skip
        if re.match(r'.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', uri):
            print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, uri[:50]))
            continue

        # skip if in db already
        if sql_driver.page_exists(uri):
            print("\t\t%s | %-50s Exists in DB, Skipping." % (count, uri[:50]))
            continue

        # only add if not in list already
        if uri not in uris_to_process:
            print("\t\t%s | %-50s Adding." % (count, uri[:50]))
            uris_to_process.append(uri)
        else:
            print("\t\t%s | %-50s Already queued, Skipping." % (count, uri[:50]))

    # release the db connection before forking workers so the pool
    # processes do not inherit a shared connection
    sql_driver.close()

    print('\t----------------------------------')
    print('\t%s addresses will now be webXray\'d' % len(uris_to_process))
    print('\t\t...you can go take a walk. ;-)')
    print('\t----------------------------------')

    # NOTE(review): this maps to self.process_uri, but the scan method visible
    # elsewhere in this file is named process_url - confirm the handler exists
    myPool = Pool(pool_size)
    myPool.map(self.process_uri, uris_to_process)
def store(self, url, browser_output, store_source=False, store_1p=True, get_file_hashes=False, hash_3p_only=False):
    """
    this is the primary function of this class,

    it takes the url of the given page and the request and cookie data generated
    by the browser

    data is cleaned up with some minor analysis (eg file types) and stored
    for later in-depth analysis.

    there is an option to store first party requests as well as third, turned on by default
    to save disk space turn off store_1p

    there is also an option to get file hashes, this introduces serious overhead
    and is turned off by default

    Returns True on success, False when the page domain cannot be parsed.
    """
    # open up a sql connection
    if self.db_engine == 'mysql':
        from webxray.MySQLDriver import MySQLDriver
        sql_driver = MySQLDriver(self.db_name)
    elif self.db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        sql_driver = SQLiteDriver(self.db_name)
    elif self.db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        sql_driver = PostgreSQLDriver(self.db_name)
    else:
        # bug fix: this referenced the undefined name db_engine (NameError
        # instead of the intended message); also fixed 'INVALED' typo
        print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
        exit()

    # get the ip, fqdn, domain, pubsuffix, and tld
    # we need the domain to figure out if cookies/elements are third-party
    origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

    # if we can't get page domain info we fail gracefully
    if origin_ip_fqdn_domain_pubsuffix_tld is None:
        sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
        return False

    origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0]
    origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1]
    origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2]
    origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3]
    origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4]

    # sql_driver.add_domain both stores the new domain and returns its db row id
    # if it is already in db just return the existing id
    page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld)

    # figure out the privacy policy url and text, starts null
    priv_policy_url = None
    priv_policy_url_text = None

    # read in our list of privacy link terms from the json file in webxray/resources/policyxray
    privacy_policy_term_list = self.utilities.get_privacy_policy_term_list()

    # we reverse links returned from browser to check footer links first
    # as that is where policy links tend to be
    all_links = browser_output['all_links']
    all_links.reverse()

    # if we have links search for privacy policy
    if len(all_links) > 0:
        # links are tuples of (text, url)
        for link_text, link_url in all_links:
            # makes sure we have text, skip links without
            if link_text:
                # need lower for string matching
                link_text = link_text.lower().strip()
                # not a link we can use
                if 'javascript' in link_text:
                    continue
                # see if the link_text is in our term list
                if link_text in privacy_policy_term_list:
                    # if the link_url is relative this will convert to absolute
                    priv_policy_url = self.utilities.get_absolute_url_from_page_link(url, link_url)
                    priv_policy_url_text = link_text
                    break

    # if the final page is https (often after a redirect), mark it appropriately
    if browser_output['final_url'][:5] == 'https':
        page_is_ssl = True
    else:
        page_is_ssl = False

    if store_source:
        # handles issue where postgres will crash on inserting null character
        source = browser_output['source'].replace('\x00', ' ')
    else:
        source = None

    # add page
    page_id = sql_driver.add_page(
        browser_output['browser_type'], browser_output['browser_version'],
        browser_output['browser_wait'], browser_output['title'],
        browser_output['meta_desc'], url, browser_output['final_url'],
        priv_policy_url, priv_policy_url_text, page_is_ssl, source,
        browser_output['load_time'], page_domain_id)

    # store cookies
    for cookie in browser_output['cookies']:
        # get the ip, fqdn, domain, pubsuffix, and tld
        # we need the domain to figure out if cookies/elements are third-party
        # note:
        # url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http://
        cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://' + cookie['domain'])

        # something went wrong, log and fail gracefully
        if cookie_ip_fqdn_domain_pubsuffix_tld is None:
            sql_driver.log_error(url, 'Error parsing cookie with domain: ' + cookie['domain'])
            continue

        # otherwise, everything went fine
        cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0]
        cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1]
        cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2]
        cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3]
        cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4]

        # mark third-party cookies
        if origin_domain != cookie_domain:
            is_3p_cookie = True
        else:
            is_3p_cookie = False

        # this is a first party cookie, see if we want to store it
        if is_3p_cookie is False and store_1p is False:
            continue

        # sql_driver.add_domain both stores the new domain and returns its id
        cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld)

        # name is required, skip cookies without one
        # (replaces try/except KeyError with an explicit membership test)
        if 'name' not in cookie:
            continue
        name = cookie['name']
        domain = cookie_domain

        # these are optional, fill with null values if absent
        # (replaces six bare try/excepts with dict.get)
        secure = cookie.get('secure')
        path = cookie.get('path')
        httponly = cookie.get('httponly')
        expiry = cookie.get('expiry')
        value = cookie.get('value')

        # all done with this cookie
        sql_driver.add_cookie(page_id, name, secure, path, domain, httponly, expiry, value, is_3p_cookie, cookie_domain_id)

    # process requests now
    for request in browser_output['processed_requests']:
        # if the request starts with the following we can't parse anyway, so skip
        if re.match(r'^(data|about|chrome|blob).+', request):
            continue

        # get the ip, fqdn, domain, pubsuffix, and tld
        # we need the domain to figure out if cookies/elements are third-party
        element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request)

        # problem with this request, log and fail gracefully
        if element_ip_fqdn_domain_pubsuffix_tld is None:
            sql_driver.log_error(url, 'Error parsing element request: ' + request)
            continue

        element_ip = element_ip_fqdn_domain_pubsuffix_tld[0]
        element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1]
        element_domain = element_ip_fqdn_domain_pubsuffix_tld[2]
        element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3]
        element_tld = element_ip_fqdn_domain_pubsuffix_tld[4]

        # sql_driver.add_domain both stores the new domain and returns its db row id
        element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld)

        # mark third-party elements based on domain
        if origin_domain != element_domain:
            is_3p_element = True
        else:
            is_3p_element = False

        # if we are not storing 1p elements continue
        if is_3p_element is False and store_1p is False:
            continue

        if request[:5] == 'https':
            element_is_ssl = True
        else:
            element_is_ssl = False

        # single lookup for this request's record; optional fields below use
        # .get() instead of the previous bare try/excepts
        request_data = browser_output['processed_requests'][request]

        received = request_data.get('received')

        # get domain of referer and determine if page leaked by referer
        referer = request_data.get('referer')
        if referer and len(referer) != 0:
            referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(referer)
            if referer_ip_fqdn_domain_pubsuffix_tld:
                if referer_ip_fqdn_domain_pubsuffix_tld[2] == origin_domain:
                    page_domain_in_referer = True
                else:
                    page_domain_in_referer = False
            else:
                page_domain_in_referer = None
                sql_driver.log_error(url, 'Error parsing referer header: ' + referer)
        else:
            page_domain_in_referer = None

        start_time_offset = request_data.get('start_time_offset')
        load_time = request_data.get('load_time')
        status = request_data.get('status')
        status_text = request_data.get('status_text')
        content_type = request_data.get('content_type')
        body_size = request_data.get('body_size')

        # headers are stored as their string representation; keep None when absent
        request_headers = request_data.get('request_headers')
        if request_headers is not None:
            request_headers = str(request_headers)
        response_headers = request_data.get('response_headers')
        if response_headers is not None:
            response_headers = str(response_headers)

        # consider anything before the "?" to be the element_url
        # (re.search returns None on no match, .group then raises AttributeError)
        try:
            element_url = re.search(r'^(.+?)\?.+$', request).group(1)
        except AttributeError:
            element_url = request

        # consider anything after the "?" to be the args
        try:
            element_args = re.search(r'^.+(\?.+)$', request).group(1)  # start url args
        except AttributeError:
            element_args = None

        # attempt to parse off the extension
        try:
            element_extension = re.search(r'\.([0-9A-Za-z]+)$', element_url).group(1).lower()
        except AttributeError:
            element_extension = None

        # lists of common extensions, can be expanded
        image_extensions = ['png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf']
        script_extensions = ['js', 'javascript']
        data_extensions = ['json', 'jsonp', 'xml']
        font_extentions = ['woff', 'ttf', 'otf']
        static_extentions = ['html', 'htm', 'shtml']
        dynamic_extentions = ['php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi']

        # figure out what type of element it is
        if element_extension in image_extensions:
            element_type = 'image'
        elif element_extension in script_extensions:
            element_type = 'javascript'
        elif element_extension in data_extensions:
            element_type = 'data_structured'
        elif element_extension == 'css':
            element_type = 'style_sheet'
        elif element_extension in font_extentions:
            element_type = 'font'
        elif element_extension in static_extentions:
            element_type = 'page_static'
        elif element_extension in dynamic_extentions:
            # bug fix: this previously compared the string to the whole list
            # with ==, so dynamic pages were never classified
            element_type = 'page_dynamic'
        elif element_extension == 'swf' or element_extension == 'fla':
            element_type = 'Shockwave Flash'
        else:
            element_type = None

        # file hashing has non-trivial overhead and off by default
        #
        # what this does is uses the same ua/referer as the actual request
        # so we are just replaying the last one to get similar response
        # note that we aren't sending the same cookies so that could be an issue
        # otherwise it is equivalent to a page refresh in theory

        # option to hash only 3p elements observed here
        if (get_file_hashes and hash_3p_only and is_3p_element) or (get_file_hashes and hash_3p_only == False):
            replay_element_request = urllib.request.Request(
                request,
                headers={
                    'User-Agent': request_data['user_agent'],
                    'Referer': referer,
                    'Accept': '*/*'
                })
            # network fetch is best-effort; narrowed from a bare except
            try:
                file_md5 = hashlib.md5(urllib.request.urlopen(replay_element_request, timeout=10).read()).hexdigest()
            except Exception:
                file_md5 = None
        else:
            file_md5 = None

        # store request
        sql_driver.add_element(
            page_id, request, element_url, is_3p_element, element_is_ssl,
            received, referer, page_domain_in_referer, start_time_offset,
            load_time, status, status_text, content_type, body_size,
            request_headers, response_headers, file_md5, element_extension,
            element_type, element_args, element_domain_id)

    # close db connection
    sql_driver.close()

    return True