def gen_url_list(stop, start=1, get_rank=False, filename="", sep=','):
    """Yield URLs for a given rank range from a given file (or the default Alexa list).

    start and stop are inclusive, 1-based indexes (to match the ranks).
    """
    if not filename:
        filename = ALEXA_TOP1M_PATH
    if not ospath.isfile(filename):
        wl_log.critical('Cannot find URL list (Top Alexa CSV etc.) file!')
        return
    for line in open(filename).readlines()[start - 1:stop]:
        if sep in line:
            # We expect a comma between rank and URL (Alexa format).
            # Beware: URLs may also include commas, hence maxsplit=1.
            rank, site_url = line.split(sep, 1)
            site_url = site_url.rstrip()
            if get_rank:  # caller asked for the rank as well
                yield int(rank), site_url
            else:
                yield site_url
        else:  # no separator found
            if get_rank:
                yield 0, line.rstrip()  # we couldn't find the rank, just send 0
            else:
                yield line.rstrip()

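# A minimal usage sketch (hypothetical, not part of the original module):
# consume the generator for the top-10 ranked sites, assuming the default
# ALEXA_TOP1M_PATH file exists and follows the Alexa "rank,domain" CSV format.
def _print_top_urls():
    for rank, site_url in gen_url_list(10, get_rank=True):
        print '%d\t%s' % (rank, site_url)
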
def mysql_init_db(db_name='fp_detective'):
    """Connect to the given MySQL database and return the connection."""
    db_conn = None
    try:
        db_conn = mdb.connect(DB_IP_ADDRESS, DB_USERNAME, DB_PASSWD, db_name)
    except mdb.Error as e:
        wl_log.critical("Error %d: %s" % (e.args[0], e.args[1]))
        raise e
    return db_conn  # callers expect the open connection back

def crawl_worker(agent_cfg, url_tuple):
    """Crawl given url. Will work in parallel. Cannot be class method."""
    MAX_SLEEP_BEFORE_JOB = 10  # prevent starting all parallel processes at the same instant
    sleep(random() * MAX_SLEEP_BEFORE_JOB)  # sleep for a while
    try:
        idx, url = url_tuple
        idx = str(idx)
        stdout_log = os.path.join(agent_cfg['job_dir'],
                                  fu.get_out_filename_from_url(url, idx, '.txt'))
        if not url[:5] in ('data:', 'http:', 'https', 'file:'):
            url = 'http://' + url
        proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'],
                                        agent_cfg['mitm_proxy_logs']) \
            if agent_cfg['use_mitm_proxy'] else ""
        if 'chrome_clicker' not in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd)  # run the command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical('Error while visiting %s(%s) w/ command: %s: (%s) %s'
                                % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)
        sleep(2)  # make sure mitmdump has timed out before we process the network dump
        if agent_cfg['post_visit_func']:
            # this pluggable function will parse the logs and do whatever we want
            agent_cfg['post_visit_func'](stdout_log, crawl_id=agent_cfg['crawl_id'])
    except Exception as exc:
        wl_log.critical('Exception in worker function %s %s' % (url_tuple, exc))

def launch_tor_service(self, logfile='/dev/null'):
    """Launch Tor service and return the process."""
    self.log_file = logfile
    self.tmp_tor_data_dir = ut.clone_dir_with_timestap(
        cm.get_tor_data_path(self.tbb_version))
    self.torrc_dict.update({'DataDirectory': self.tmp_tor_data_dir,
                            'Log': ['INFO file %s' % logfile]})
    wl_log.debug("Tor config: %s" % self.torrc_dict)
    try:
        self.tor_process = stem.process.launch_tor_with_config(
            config=self.torrc_dict,
            init_msg_handler=self.tor_log_handler,
            tor_cmd=cm.get_tor_bin_path(self.tbb_version),
            timeout=270)
        self.controller = Controller.from_port()
        self.controller.authenticate()
        wl_log.info("Tor running at port {0} & controller port {1}.".format(
            cm.SOCKS_PORT, cm.CONTROLLER_PORT))
        return self.tor_process
    except stem.SocketError as exc:
        wl_log.critical("Unable to connect to tor on port %s: %s" %
                        (cm.SOCKS_PORT, exc))
        sys.exit(1)
    except:
        # most of the time this is due to another instance of
        # tor running on the system
        wl_log.critical("Error launching Tor", exc_info=True)
        sys.exit(1)

def is_targz_archive_corrupt(arc_path):
    # http://stackoverflow.com/a/2001749/3104416
    tar_gz_check_cmd = "gunzip -c %s | tar t > /dev/null" % arc_path
    tar_status, tar_txt = commands.getstatusoutput(tar_gz_check_cmd)
    if tar_status:
        wl_log.critical("Tar check failed: %s tar_status: %s tar_txt: %s" %
                        (tar_gz_check_cmd, tar_status, tar_txt))
        return tar_status
    return False  # no error

def init_mitmproxy(basename, timeout, logging):
    try:
        # run a mitmdump process with a timeout of timeout + 1 seconds
        port, pid = run_mitmdump(basename, timeout + 1, logging)
    except:
        wl_log.critical('Exception initializing mitmdump')
        return ""
    else:
        wl_log.info('mitmdump will listen on port %s, pid %s' % (port, pid))
        return "127.0.0.1:%s " % port if port and pid else ""

def insert_to_db(db_conn, query, args):
    with closing(db_conn.cursor(mdb.cursors.DictCursor)) as db_cursor:
        try:
            db_cursor.execute(query, args)
        except Exception as ex:
            wl_log.critical('Exception executing query: %s %s' % (query, args))
            raise ex
        db_conn.commit()
        return db_cursor.lastrowid

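# A hypothetical usage sketch combining mysql_init_db() and insert_to_db().
# The table and column names below are illustrative assumptions, not taken
# from the original schema; the query uses MySQLdb's parameterized form so
# the driver handles quoting and escaping.
def _insert_example():
    db_conn = mysql_init_db('fp_detective')
    try:
        row_id = insert_to_db(db_conn,
                              'INSERT INTO example_table (url, crawl_id)'
                              ' VALUES (%s, %s)',
                              ('http://example.com', 1))
        wl_log.info('Inserted row %s' % row_id)
    finally:
        db_conn.close()
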
def update_crawl_time(db_conn, crawl_id):
    with closing(db_conn.cursor(mdb.cursors.DictCursor)) as db_cursor:
        try:
            # use a parameterized query so the timestamp is quoted correctly
            db_cursor.execute("UPDATE crawl_job SET finish_time = %s"
                              " WHERE crawl_id = %s",
                              (time.strftime('%Y-%m-%d %H:%M:%S'), crawl_id))
        except Exception as ex:
            wl_log.critical('Exception executing UPDATE query for crawl %s: %s'
                            % (crawl_id, ex))
            raise ex
        db_conn.commit()
        return db_cursor.lastrowid

def crawl(self, num_batches=cm.NUM_BATCHES,
          num_instances=cm.NUM_INSTANCES, start_line=0):
    wl_log.info('Crawl configuration: batches %s, instances: %s, '
                'tbb_version %s, no of URLs: %s, crawl dir: %s, '
                'XVFB: %s, screenshot: %s' %
                (num_batches, num_instances, self.tbb_version,
                 len(self.urls), self.crawl_dir, self.xvfb,
                 self.capture_screen))
    # for each batch
    for batch_num in range(num_batches):
        wl_log.info('********** Starting batch %s **********' % batch_num)
        site_num = start_line
        bg_site = None
        batch_dir = ut.create_dir(os.path.join(self.crawl_dir, str(batch_num)))
        # init/reset the Tor process to get a different circuit and
        # make sure that we're not using the same guard node again
        wl_log.info('********** Restarting Tor Before Batch **********')
        self.tor_controller.restart_tor()
        sites_crawled_with_same_proc = 0
        # for each site
        for page_url in self.urls:
            sites_crawled_with_same_proc += 1
            if sites_crawled_with_same_proc > cm.MAX_SITES_PER_TOR_PROCESS:
                wl_log.info('********** Restarting Tor Process **********')
                self.tor_controller.restart_tor()
                sites_crawled_with_same_proc = 0
            wl_log.info('********** Crawling %s **********' % page_url)
            page_url = page_url[:cm.MAX_FNAME_LENGTH]
            site_dir = ut.create_dir(os.path.join(
                batch_dir, ut.get_filename_from_url(page_url, site_num)))
            # for each visit
            for instance_num in range(num_instances):
                wl_log.info('********** Visit #%s to %s **********' %
                            (instance_num, page_url))
                self.visit = None
                try:
                    self.visit = Visit(batch_num, site_num, instance_num,
                                       page_url, site_dir, self.tbb_version,
                                       self.tor_controller, bg_site,
                                       self.xvfb, self.capture_screen)
                    self.visit.get()
                except KeyboardInterrupt:  # CTRL + C
                    raise KeyboardInterrupt
                except (ut.TimeExceededError, TimeoutException) as exc:
                    wl_log.critical('Visit to %s timed out! %s %s' %
                                    (page_url, exc, type(exc)))
                    if self.visit:
                        self.visit.cleanup_visit()
                except Exception:
                    wl_log.critical('Exception crawling %s' % page_url,
                                    exc_info=True)
                    if self.visit:
                        self.visit.cleanup_visit()
            # END - for each visit
            site_num += 1
            time.sleep(cm.PAUSE_BETWEEN_SITES)

def crawl_urls(br_type, urls, fn=lambda x: x):
    for url in urls:
        try:
            br = init_browser(br_type)
        except:
            wl_log.critical('Cannot initialize browser: %s' % br_type)
        else:
            try:
                crawl_url(br, url, fn)
            except Exception as e:
                wl_log.error("Error crawling %s: %s" % (url, e))
            br.quit()

def parse_mitm_dump(basename, worker, crawl_id):
    dumpfile = basename + '.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    responses = []
    if os.path.isfile(dumpfile):
        fr = flow.FlowReader(open(dumpfile))
        try:
            for msg in fr.stream():
                requests.append(msg.request.get_url())
                # responses.append(msg.response.get_url())
                # the worker function should take care of db insertion, logging etc.
                worker(msg, crawl_id)
        except flow.FlowReadError:
            pass
            # wl_log.critical("Error reading mitm dump %s" % exc)
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)

    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1
    doma_info.rank = int(os.path.basename(dumpfile).split('-')[0]) \
        if '-' in dumpfile else 0
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)

    # parse the crawl log and insert JavaScript info to the db
    log_file = basename + '.txt'
    if not os.path.isfile(log_file):
        log_file = basename + '.' + MITM_LOG_EXTENSION
    insert_js_fun = functools.partial(lp.insert_js_info_to_db,
                                      site_info_id=site_info_id,
                                      db_conn=db_conn)
    lp.parse_crawl_log(log_file, insert_js_fun, crawl_id)
    db_conn.commit()
    db_conn.close()
    wl_log.info("Parsed %s OK" % dumpfile)
    if REMOVE_DMP_FILES:
        os.remove(dumpfile)

def close_all_streams(self):
    """Close all streams of a controller."""
    wl_log.debug("Closing all streams")
    try:
        ut.timeout(cm.STREAM_CLOSE_TIMEOUT)
        for stream in self.controller.get_streams():
            wl_log.debug("Closing stream %s %s %s " %
                         (stream.id, stream.purpose, stream.target_address))
            self.controller.close_stream(stream.id)  # MISC reason
    except ut.TimeExceededError:
        wl_log.critical("Closing streams timed out!")
    except:
        wl_log.debug("Exception closing stream")
    finally:
        ut.cancel_timeout()

def get_free_port():
    """Get a free port number for mitmdump.

    http://stackoverflow.com/questions/1365265/on-localhost-how-to-pick-a-free-port-number?#answer-1365284
    """
    max_tries = 0
    while max_tries < MITM_MAX_TRIES:
        max_tries += 1
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.bind(('', 0))
            port = s.getsockname()[1]
        except Exception as ex:
            wl_log.critical('Exception when trying to bind to socket %s' % ex)
            sleep(1)
        else:
            s.close()  # release the port so mitmdump can bind to it
            return port
    return None

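# A small usage sketch (hypothetical): reserve a free port before launching
# mitmdump, and log a critical error if none could be found within
# MITM_MAX_TRIES attempts.
def _pick_mitmdump_port():
    port = get_free_port()
    if port is None:
        wl_log.critical('Could not find a free port for mitmdump')
    return port
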
def pack_crawl_data(crawl_dir):
    """Compress the crawl dir into a tar archive."""
    if not os.path.isdir(crawl_dir):
        wl_log.critical("Cannot find the crawl dir: %s" % crawl_dir)
        return False
    if crawl_dir.endswith(os.path.sep):  # strip the trailing separator
        crawl_dir = crawl_dir[:-1]
    crawl_name = os.path.basename(crawl_dir)
    containing_dir = os.path.dirname(crawl_dir)
    os.chdir(containing_dir)
    arc_path = "%s.tar.gz" % crawl_name
    tar_cmd = "tar czvf %s %s" % (arc_path, crawl_name)
    wl_log.debug("Packing the crawl dir with cmd: %s" % tar_cmd)
    status, txt = commands.getstatusoutput(tar_cmd)
    if status or is_targz_archive_corrupt(arc_path):
        wl_log.critical("Tar command failed or archive is corrupt: %s \nSt: %s txt: %s" %
                        (tar_cmd, status, txt))
        return False
    else:
        return True

def crawl_url(crawler_type, page_url, proxy_opt):
    if 'clicker' in crawler_type:
        worker = click_crawler
    else:
        worker = lazy_crawler
    br = init_browser('chrome',
                      ['--allow-running-insecure-content',
                       '--ignore-certificate-errors',
                       '--disk-cache-size=0',
                       '--enable-logging', '--v=1',
                       "--proxy-server=%s" % proxy_opt])
    if not page_url.startswith('http') and not page_url.startswith('file:'):
        page_url = 'http://' + page_url
    wl_log.info('***Will crawl %s***' % page_url)
    try:
        ut.timeout(CRAWLER_CLICKER_VISIT_TIMEOUT)
        worker(br, page_url)  # run the worker function
    except ut.TimeExceededError as texc:
        wl_log.critical('***CRAWLER_CLICKER_VISIT_TIMEOUT at %s (%s)' %
                        (page_url, texc))
    finally:
        br.quit()

def get_public_suffix(self, url):
    try:
        return self.psl.get_public_suffix(urlparse(url).hostname)
    except Exception as e:
        wl_log.critical('Exception(%s) parsing url: %s' % (e, url))
        return ''

def die(last_words):
    """Log last words and exit."""
    wl_log.critical(last_words)
    sys.exit(1)

def parse_crawl_log(filename, dump_fun=None, crawl_id=0):
    """Populate a DomainInfo object by parsing the crawl log file of a site.

    Call dump_fun to output the dump log.

    Logs parsed by this function are generated either by setting the
    environment variable FC_DEBUG to 1 or by the (modified) browser.
    See the fontconfig library for details.
    """
    origins_to_fonts = {}  # keep the origin -> loaded fonts mapping
    domaInfo = DomainInfo()
    file_content = fu.read_file(filename)
    # TODO chromium?
    url_match = re.search(r"opening url: ([^,]*)", file_content)
    url = url_match.group(1) if url_match else filename
    wl_log.info('Parsing log for %s %s' % (url, filename))
    # match the family field of the font request (not the matched one)
    fonts_by_fc_debug = re.findall(r"Sort Pattern.*$\W+family: \"([^\"]*)",
                                   file_content, re.MULTILINE)
    # offsetWidth / offsetHeight access attempts
    domaInfo.num_offsetWidth_calls = len(
        re.findall(r"Element::offsetWidth", file_content))
    domaInfo.num_offsetHeight_calls = len(
        re.findall(r"Element::offsetHeight", file_content))
    # TODO add getBoundingClientRect
    # output from the modified browser
    font_and_urls = re.findall(r"CSSFontSelector::getFontData:? (.*) ([^\s]*)",
                               file_content)
    font_face_pairs = re.findall(r"CSSFontFace::getFontData (.*)->(.*)",
                                 file_content)
    domaInfo.log_complete = int(
        bool(re.findall(r"Finished all steps!", file_content)))
    js_log_prefix = ">>>FPLOG"
    fpd_logs = re.findall(r'%s.*' % js_log_prefix, file_content)
    domaInfo.fpd_logs = [call[len(js_log_prefix) + 1:] for call in set(fpd_logs)]
    for font_name, font_url in font_and_urls:
        if font_url.startswith('http') and len(font_name) > 1 \
                and not font_name[:5] in ('data:', 'http:', 'https'):
            # font_name = font_name.rsplit(' ', 1)[0] if font_name.endswith(' onURL:') else font_name
            # TODO: unify chrome source code to log as Phantom does, then remove this line
            font_name = font_name.lower().strip()
            # origin = pub_suffix.get_public_suffix(font_url)
            origin = font_url
            if origin in origins_to_fonts:
                origins_to_fonts[origin].add(font_name)
            else:
                origins_to_fonts[origin] = set([font_name, ])
    for font, face in font_face_pairs:
        font = font.lower().strip()
        face = face.lower().strip()
        # replace all occurrences of this font-family name with the face
        for fonts_by_origin in origins_to_fonts.itervalues():
            try:
                fonts_by_origin.remove(font)
            except KeyError:
                pass  # this font is not in this origin's list
            else:
                fonts_by_origin.add(face)
    for origin, fonts in origins_to_fonts.iteritems():
        domaInfo.fonts_by_origins[origin] = list(fonts)
        domaInfo.fonts_loaded += domaInfo.fonts_by_origins[origin]
    # filter out the data urls and web fonts
    domaInfo.fc_dbg_font_loads = list(set(
        [font.lower() for font in fonts_by_fc_debug
         if not font[:5] in ('data:', 'http:', 'https')]))
    domaInfo.fonts_loaded = list(set(
        [font.lower() for font in domaInfo.fonts_loaded
         if not font[:5] in ('data:', 'http:', 'https')]))
    requests = re.findall(r"^requested: (http.*)", file_content, re.MULTILINE)
    if not requests and filename.endswith(MITM_LOG_EXTENSION):
        requests = re.findall(r"(http.*)", file_content, re.MULTILINE)
    responses = ''
    # populate the domain info object
    domaInfo.num_font_loads = len(domaInfo.fonts_loaded)
    domaInfo.requests = list(set(requests))
    domaInfo.responses = list(set(responses))
    domaInfo.fp_detected = get_fp_from_reqs(requests)
    domaInfo.url = url
    # !!! rank may not be right; it's only true for a top-Alexa crawl
    domaInfo.rank = get_rank_domain_from_filename(filename)[0]
    domaInfo.log_filename = filename
    domaInfo.crawl_id = crawl_id
    if dump_fun:  # call the dump function
        try:
            dump_fun(domaInfo)
        except KeyboardInterrupt:
            raise
        except Exception as exc:
            wl_log.critical("Exception while dumping %s: %s" % (domaInfo.url, exc))

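# A hypothetical usage sketch: parse a single crawl log and, instead of writing
# to the database, log a short summary from the populated DomainInfo object.
# The log path below is illustrative only.
def _summarize_crawl_log(log_path='/tmp/example-crawl.txt'):
    def _dump(doma_info):
        wl_log.info('%s: %s font loads, fp_detected=%s' %
                    (doma_info.url, doma_info.num_font_loads,
                     doma_info.fp_detected))
    parse_crawl_log(log_path, dump_fun=_dump, crawl_id=0)
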
def crawl(self, num_batches=cm.NUM_BATCHES,
          num_instances=cm.NUM_INSTANCES, start_line=0):
    wl_log.info("Crawl configuration: batches: %s, instances: %s,"
                " tbb_version: %s, experiment: %s, no of URLs: %s, "
                "crawl dir: %s, XVFB: %s, screenshot: %s" %
                (num_batches, num_instances, self.tbb_version,
                 self.experiment, len(self.urls), self.crawl_dir,
                 self.xvfb, self.capture_screen))
    # for each batch
    for batch_num in xrange(num_batches):
        wl_log.info("********** Starting batch %s **********" % batch_num)
        site_num = start_line
        bg_site = None
        batch_dir = ut.create_dir(os.path.join(self.crawl_dir,
                                               str(batch_num)))
        # init/reset the Tor process to get a different circuit and
        # make sure that we're not using the same guard node again
        wl_log.info("********** Restarting Tor Before Batch **********")
        self.tor_controller.restart_tor()
        sites_crawled_with_same_proc = 0
        # for each site
        for page_url in self.urls:
            sites_crawled_with_same_proc += 1
            if sites_crawled_with_same_proc > cm.MAX_SITES_PER_TOR_PROCESS:
                wl_log.info("********** Restarting Tor Process **********")
                self.tor_controller.restart_tor()
                sites_crawled_with_same_proc = 0
            wl_log.info("********** Crawling %s **********" % page_url)
            page_url = page_url[:cm.MAX_FNAME_LENGTH]
            site_dir = ut.create_dir(os.path.join(
                batch_dir, ut.get_filename_from_url(page_url, site_num)))
            if self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
                bg_site = choice(self.urls)
            # for each visit
            for instance_num in range(num_instances):
                wl_log.info("********** Visit #%s to %s **********" %
                            (instance_num, page_url))
                self.visit = None
                try:
                    self.visit = Visit(batch_num, site_num, instance_num,
                                       page_url, site_dir,
                                       self.tor_controller, bg_site,
                                       self.experiment, self.xvfb,
                                       self.capture_screen)
                    self.visit.get()
                except KeyboardInterrupt:  # CTRL + C
                    raise KeyboardInterrupt
                except (ut.TimeExceededError, TimeoutException) as exc:
                    wl_log.critical("Visit to %s timed out! %s %s" %
                                    (page_url, exc, type(exc)))
                    if self.visit:
                        self.visit.cleanup_visit()
                except Exception:
                    wl_log.critical("Exception crawling %s" % page_url,
                                    exc_info=True)
                    if self.visit:
                        self.visit.cleanup_visit()
            # END - for each visit
            site_num += 1
            time.sleep(cm.PAUSE_BETWEEN_SITES)
