def setUp(self):
    self.dirs_to_remove = []
    self.db_conn = dbu.mysql_init_db('fp_detective_test')
    self.domainInfo = lp.DomainInfo()  # create a new DomainInfo obj for tests
    self.domainInfo.rank = 1
    self.domainInfo.log_filename = '/var/log/syslog'
    self.domainInfo.url = 'http://google.com'
    self.domainInfo.fonts_loaded = ['Arial', 'Tahoma', 'Georgia', '微软雅黑']
    self.domainInfo.fonts_by_origins = {'http://google.com': ['arial', 'Tahoma'],
                                        'http://yahoo.com': ['Georgia']}
    self.domainInfo.requests = ['http://google.com', 'http://yahoo.com']
    self.domainInfo.responses = ['http://abc.com', 'http://xyz.com']
    self.domainInfo.num_font_loads = 50
    self.domainInfo.num_offsetWidth_calls = 15
    self.domainInfo.num_offsetHeight_calls = 15
    self.domainInfo.fp_detected = ['iesnare', 'bluecava']
    self.domainInfo.crawl_id = 64654
    self.domainInfo.fpd_logs = ['userAgent', 'appCodeName']
    self.domainInfo.fc_dbg_font_loads = ['Arial', 'Tahoma', 'Georgia',
                                         'someotherfont', '微软雅黑']
    self.domainInfo.log_complete = 1
    ha = ag.HeadlessAgent()
    self.crawl_job = ag.CrawlJob(ha)
    self.dirs_to_remove.append(self.crawl_job.job_dir)
    self.crawl_job.urls = ['http://google.com', 'http://yahoo.com']
    self.crawl_job.desc
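# A minimal tearDown sketch to pair with the setUp above, closing the test DB
# connection and removing the job folders collected in dirs_to_remove. The
# shutil-based cleanup is an assumption; the original tearDown is not shown
# in this listing.
def tearDown(self):
    import shutil
    self.db_conn.close()
    for dir_path in self.dirs_to_remove:
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)  # remove the crawl job folder created in setUp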
def should_crawl_and_find_swfs(self, url, expected_strings=(None, None),
                               unexpected_strings=(None, None),
                               crawler_type='chrome_lazy'):
    expected_fields, expected_values = expected_strings
    unexpected_fields, unexpected_values = unexpected_strings
    crawl_id = ag.crawl_sites([(1, url), ], crawler_type, num_crawl_urls=1)
    db_conn = dbu.mysql_init_db()
    db_curs = db_conn.cursor(mdb.cursors.DictCursor)
    rows = swu.get_swf_obj_from_db('crawl_id', int(crawl_id), db_curs)
    if expected_fields:
        found_dict = {}
        for field in expected_fields:
            found_dict[field] = False
    for row in rows:
        if expected_fields:
            for expected_field, expected_value in zip(expected_fields, expected_values):
                if expected_value in row[expected_field]:
                    found_dict[expected_field] = True
                    print 'found in ', row[expected_field]
        if unexpected_values:
            for unexpected_field, unexpected_value in zip(unexpected_fields, unexpected_values):
                if unexpected_value in row[unexpected_field]:
                    self.fail('Unexpected field %s with unexpected value %s found'
                              % (unexpected_field, unexpected_value))
    if expected_fields:
        for field, found in found_dict.iteritems():
            if not found:
                self.fail('Cannot find %s' % field)
    db_curs.close()
    db_conn.close()
def setUp(self):
    self.dirs_to_remove = []
    self.db_conn = dbu.mysql_init_db('fp_detective_test')
    self.domainInfo = lp.DomainInfo()  # create a new DomainInfo obj for tests
    self.domainInfo.rank = 1
    self.domainInfo.log_filename = '/var/log/syslog'
    self.domainInfo.url = 'http://google.com'
    self.domainInfo.fonts_loaded = ['Arial', 'Tahoma', 'Georgia', '微软雅黑']
    self.domainInfo.fonts_by_origins = {'http://google.com': ['arial', 'Tahoma'],
                                        'http://yahoo.com': ['Georgia']}
    self.domainInfo.requests = ['http://google.com', 'http://yahoo.com']
    self.domainInfo.responses = ['http://abc.com', 'http://xyz.com']
    self.domainInfo.num_font_loads = 50
    self.domainInfo.num_offsetWidth_calls = 15
    self.domainInfo.num_offsetHeight_calls = 15
    self.domainInfo.fp_detected = [fpr.FINGERPRINTER_REGEX.items()[:2]]
    self.domainInfo.crawl_id = 64654
    self.domainInfo.fpd_logs = ['userAgent', 'appCodeName']
    self.domainInfo.fc_dbg_font_loads = ['Arial', 'Tahoma', 'Georgia',
                                         'someotherfont', '微软雅黑']
    self.domainInfo.log_complete = 1
    ha = ag.HeadlessAgent()
    self.crawl_job = ag.CrawlJob(ha)
    self.dirs_to_remove.append(self.crawl_job.job_dir)
    self.crawl_job.urls = ['http://google.com', 'http://yahoo.com']
    self.crawl_job.desc
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):
    referer = msg.request.headers['Referer'][0] if msg.request.headers['Referer'] else ""
    if msg.response and msg.response.content:
        print msg.request.get_url()
        if msg.response.content[:3] in SWF_MAGIC_NUMBERS:  # too wide, but the decompiler will discard non-SWFs
            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()
            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)
            if not rows:  # first time we see this SWF; save it and extract its vector
                swf_filename = os.path.join(dir_path, "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'
                wl_log.info("SWF saved %s referer: %s" % (os.path.basename(swf_filename), referer))
                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(swf_filename,
                                                           os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:  # reuse the stored vector and path for a known SWF
                wl_log.info("A SWF with the same hash exists in DB: %s %s" % (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1
            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()
            swf_info.rank = rank  # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer
            swf_info.duplicate = duplicate_swf  # 1 for repeated SWFs (ones we have seen before)
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id
            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()
        elif '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-SWF %s %s"
                           % (msg.request.path, msg.response.content[:100]))
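# SWF_MAGIC_NUMBERS and MAX_FILENAME_LEN are referenced above but not defined in
# this listing. A plausible sketch based on the SWF container format, whose first
# three bytes are 'FWS' (uncompressed), 'CWS' (zlib-compressed) or 'ZWS'
# (LZMA-compressed); the filename cap is an assumption (most filesystems limit a
# name to 255 bytes).
SWF_MAGIC_NUMBERS = ('FWS', 'CWS', 'ZWS')
MAX_FILENAME_LEN = 250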
def parse_mitm_dump(basename, worker, crawl_id):
    dumpfile = basename + '.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    responses = []
    if os.path.isfile(dumpfile):
        fr = flow.FlowReader(open(dumpfile))
        try:
            for msg in fr.stream():
                requests.append(msg.request.get_url())
                # responses.append(msg.response.get_url())
                worker(msg, crawl_id)  # the worker func should take care of DB insertion, logging etc.
        except flow.FlowReadError as exc:
            pass  # wl_log.critical("Error reading mitm dump %s" % exc)
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)
    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1
    print os.path.basename(dumpfile[:-4]).split('-')[0]
    doma_info.rank = int(os.path.basename(dumpfile).split('-')[0]) if '-' in dumpfile else 0
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)
    # parse the crawl log and insert JS info into the DB
    log_file = basename + '.txt'
    if not os.path.isfile(log_file):
        log_file = basename + '.' + MITM_LOG_EXTENSION
    insert_js_fun = functools.partial(lp.insert_js_info_to_db,
                                      site_info_id=site_info_id, db_conn=db_conn)
    lp.parse_crawl_log(log_file, insert_js_fun, crawl_id)
    db_conn.commit()
    db_conn.close()
    wl_log.info("Parsed %s OK" % dumpfile)
    if REMOVE_DMP_FILES:
        os.remove(dumpfile)
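# A hedged usage sketch wiring store_swfs into parse_mitm_dump: functools.partial
# pre-binds dir_path and prefix so the resulting callable matches the
# worker(msg, crawl_id) call above. The basename and crawl_id are made up for
# illustration; the prefix follows the "<rank>-<domain>" convention that
# store_swfs expects when it splits rank and domain.
swf_worker = functools.partial(store_swfs, dir_path='/tmp', prefix='/tmp/1-example.com')
parse_mitm_dump('/tmp/1-example.com', swf_worker, crawl_id=1)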
def __init__(self, agent):
    self.job_dir = create_job_folder()
    self.desc = ''
    self.urls = []
    self.url_tuples = []
    self.num_crawl_urls = 0
    self.max_parallel_procs = MAX_PARALLEL_PROCESSES
    self.crawl_agent = agent
    self.crawl_agent.crawl_job = self
    self.crawl_agent.job_dir = self.job_dir  # for passing to the multiprocessing worker - should find a better way
    self.index_html_log = os.path.join(self.crawl_agent.job_dir, 'index.html')
    self.db_conn = dbu.mysql_init_db('fp_detective')
    self.crawl_id = dbu.add_crawl_job_to_db(self, self.db_conn)
    self.crawl_agent.crawl_id = self.crawl_id
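# A minimal construction sketch, mirroring the test setUp above: a CrawlJob
# registers itself in the DB on construction and shares its job_dir and
# crawl_id with its agent. The module alias ag and the URL list are
# illustrative, following the usage in the tests.
ha = ag.HeadlessAgent()
job = ag.CrawlJob(ha)
job.urls = ['http://example.com']
job.num_crawl_urls = len(job.urls)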
def test_mysql_init_db(self):
    db_conn = dbu.mysql_init_db()
    self.assertTrue(db_conn, "Cannot initialize connection.")
def insert_domain_info_to_db(doma_info):
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)
    dbu.add_js_info_to_db(doma_info, db_conn, site_info_id)
    db_conn.commit()
    db_conn.close()
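# A hedged example of calling insert_domain_info_to_db with a minimally
# populated lp.DomainInfo; the field values are illustrative, following the
# test setUp above.
doma_info = lp.DomainInfo()
doma_info.url = 'http://example.com'
doma_info.rank = 1
doma_info.crawl_id = 1
doma_info.requests = []
doma_info.responses = []
doma_info.fp_detected = []
doma_info.fc_dbg_font_loads = []
doma_info.log_complete = 1
insert_domain_info_to_db(doma_info)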
def setUp(self):
    # test_db_file = ':memory:'
    # schema = '../swf_schema.sql'
    self.db_conn = dbu.mysql_init_db('fp_detective_test')
    self.db_cursor = self.db_conn.cursor(mdb.cursors.DictCursor)  # get a dict cursor for the test DB