def test_get_fp_from_reqs(self):
    # Exercises lp.get_fp_from_reqs with the entries of fp_urls (expected to
    # be detected) and with a fixed list of URLs expected not to be detected.

    # Each entry of fp_urls, passed alone, must give a truthy result.
    for url in fp_urls:
        single_request = [url]
        detected = lp.get_fp_from_reqs(single_request)
        self.assertTrue(detected, 'Cannot find fp in %s' % url)

    # The whole list at once must give a truthy result as well
    # (should return something different if it finds fp urls).
    all_detected = lp.get_fp_from_reqs(fp_urls)
    self.assertTrue(all_detected)

    # A request list without any fp url must give a falsy result.
    clean_requests = ['foo.org', 'foo.com', 'bar.org']
    self.assertFalse(lp.get_fp_from_reqs(clean_requests))
def parse_mitm_dump(basename, worker, crawl_id):
    """Parse a mitm dump and its crawl log and store the results in the DB.

    Reads ``basename + '.dmp'``, passes every message read from it to
    ``worker``, inserts a site-info record built from the collected request
    URLs, then parses the accompanying crawl log so the JS info is inserted
    under that record.

    Args:
        basename: Path of the dump without the '.dmp' extension. The crawl
            log is looked up as ``basename + '.txt'`` and, if that file does
            not exist, as ``basename + '.' + MITM_LOG_EXTENSION``.
        worker: Callable invoked as ``worker(msg, crawl_id)`` for every
            message in the dump; it should take care of db insertion,
            logging etc.
        crawl_id: Crawl identifier stored on the site-info record and passed
            on to ``worker`` and ``lp.parse_crawl_log``.

    A missing dump is logged as critical but does not abort the function:
    the site-info record is still written (with an empty request list) and
    the crawl log is still parsed.
    """
    dumpfile = basename + '.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    responses = []  # response URLs are not collected; stays empty

    dump_found = os.path.isfile(dumpfile)
    if dump_found:
        # Open in binary mode and through a context manager so the handle is
        # closed even if the worker or the reader raises.
        with open(dumpfile, 'rb') as dump_fh:
            fr = flow.FlowReader(dump_fh)
            try:
                for msg in fr.stream():
                    requests.append(msg.request.get_url())
                    # this worker func should take care of db insertion,
                    # logging etc.
                    worker(msg, crawl_id)
            except flow.FlowReadError as exc:
                # Best effort: keep whatever was read before the error, but
                # leave a trace instead of swallowing it silently. Logged at
                # info level because the original author had disabled the
                # critical-level message for this case.
                wl_log.info("Error reading mitm dump %s: %s" % (dumpfile, exc))
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)

    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1

    # Parenthesised so the statement is valid on both Python 2 and 3; the
    # printed text is unchanged.
    print(os.path.basename(dumpfile[:-4]).split('-')[0])

    # The rank is the part of the file name before the first '-'. Test the
    # file name rather than the whole path: a dash in a directory name must
    # not trigger int() on a name that carries no rank.
    dump_name = os.path.basename(dumpfile)
    rank = 0
    if '-' in dump_name:
        try:
            rank = int(dump_name.split('-')[0])
        except ValueError:
            # Prefix before the dash is not a number: fall back to 0, the
            # same value used when the name has no dash at all.
            rank = 0
    doma_info.rank = rank

    db_conn = dbu.mysql_init_db()
    try:
        site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)

        # parse
        log_file = basename + '.txt'
        if not os.path.isfile(log_file):
            log_file = basename + '.' + MITM_LOG_EXTENSION

        insert_js_fun = functools.partial(lp.insert_js_info_to_db,
                                          site_info_id=site_info_id,
                                          db_conn=db_conn)
        # parse log, insert js info to db
        lp.parse_crawl_log(log_file, insert_js_fun, crawl_id)
        db_conn.commit()
    finally:
        # Release the connection even when insertion or log parsing fails.
        db_conn.close()

    wl_log.info("Parsed %s OK" % (dumpfile))
    # Only remove a dump that was actually there; os.remove would raise on
    # the missing file that was already reported above.
    if REMOVE_DMP_FILES and dump_found:
        os.remove(dumpfile)
def parse_mitm_dump(basename, worker, crawl_id):
    """Parse a mitm dump and its crawl log and store the results in the DB.

    Reads ``basename + '.dmp'``, passes every message read from it to
    ``worker``, inserts a site-info record built from the collected request
    URLs, then parses the accompanying crawl log so the JS info is inserted
    under that record.

    Args:
        basename: Path of the dump without the '.dmp' extension. The crawl
            log is looked up as ``basename + '.txt'`` and, if that file does
            not exist, as ``basename + '.' + MITM_LOG_EXTENSION``.
        worker: Callable invoked as ``worker(msg, crawl_id)`` for every
            message in the dump; it should take care of db insertion,
            logging etc.
        crawl_id: Crawl identifier stored on the site-info record and passed
            on to ``worker`` and ``lp.parse_crawl_log``.

    A missing dump is logged as critical but does not abort the function:
    the site-info record is still written (with an empty request list) and
    the crawl log is still parsed.
    """
    dumpfile = basename + '.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    responses = []  # response URLs are not collected; stays empty

    dump_found = os.path.isfile(dumpfile)
    if dump_found:
        # Open in binary mode and through a context manager so the handle is
        # closed even if the worker or the reader raises.
        with open(dumpfile, 'rb') as dump_fh:
            fr = flow.FlowReader(dump_fh)
            try:
                for msg in fr.stream():
                    requests.append(msg.request.get_url())
                    # this worker func should take care of db insertion,
                    # logging etc.
                    worker(msg, crawl_id)
            except flow.FlowReadError as exc:
                # Best effort: keep whatever was read before the error, but
                # leave a trace instead of swallowing it silently. Logged at
                # info level because the original author had disabled the
                # critical-level message for this case.
                wl_log.info("Error reading mitm dump %s: %s" % (dumpfile, exc))
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)

    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1

    # Parenthesised so the statement is valid on both Python 2 and 3; the
    # printed text is unchanged.
    print(os.path.basename(dumpfile[:-4]).split('-')[0])

    # The rank is the part of the file name before the first '-'. Test the
    # file name rather than the whole path: a dash in a directory name must
    # not trigger int() on a name that carries no rank.
    dump_name = os.path.basename(dumpfile)
    rank = 0
    if '-' in dump_name:
        try:
            rank = int(dump_name.split('-')[0])
        except ValueError:
            # Prefix before the dash is not a number: fall back to 0, the
            # same value used when the name has no dash at all.
            rank = 0
    doma_info.rank = rank

    db_conn = dbu.mysql_init_db()
    try:
        site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)

        # parse
        log_file = basename + '.txt'
        if not os.path.isfile(log_file):
            log_file = basename + '.' + MITM_LOG_EXTENSION

        insert_js_fun = functools.partial(lp.insert_js_info_to_db,
                                          site_info_id=site_info_id,
                                          db_conn=db_conn)
        # parse log, insert js info to db
        lp.parse_crawl_log(log_file, insert_js_fun, crawl_id)
        db_conn.commit()
    finally:
        # Release the connection even when insertion or log parsing fails.
        db_conn.close()

    wl_log.info("Parsed %s OK" % (dumpfile))
    # Only remove a dump that was actually there; os.remove would raise on
    # the missing file that was already reported above.
    if REMOVE_DMP_FILES and dump_found:
        os.remove(dumpfile)
def test_get_fp_from_reqs(self):
    # Exercises lp.get_fp_from_reqs with the entries of fp_urls (expected to
    # be detected) and with a fixed list of URLs expected not to be detected.

    # Each entry of fp_urls, passed alone, must give a truthy result.
    for url in fp_urls:
        single_request = [url]
        detected = lp.get_fp_from_reqs(single_request)
        self.assertTrue(detected, 'Cannot find fp in %s' % url)

    # The whole list at once must give a truthy result as well
    # (should return something different if it finds fp urls).
    all_detected = lp.get_fp_from_reqs(fp_urls)
    self.assertTrue(all_detected)

    # A request list without any fp url must give a falsy result.
    clean_requests = ['foo.org', 'foo.com', 'bar.org']
    self.assertFalse(lp.get_fp_from_reqs(clean_requests))