def test_check_canvas_rw_access(self):
    # Two events of types EVENT_TODATAURL and EVENT_FILLTEXT that share the
    # same js_file: check_canvas_rw_access must return exactly that URL.
    script_url = "http://example.com/fp.js"

    to_data_url_ev = cm.BrowserEvent()
    to_data_url_ev.event_type = cm.EVENT_TODATAURL
    to_data_url_ev.js_file = script_url

    fill_text_ev = cm.BrowserEvent()
    fill_text_ev.event_type = cm.EVENT_FILLTEXT
    fill_text_ev.js_file = script_url

    detected_urls = ex.check_canvas_rw_access([to_data_url_ev, fill_text_ev])
    self.assertEqual([script_url], detected_urls)
def test_check_canvas_rw_access_for_diff_js(self):
    # The EVENT_TODATAURL and EVENT_FILLTEXT events come from two different
    # js_file URLs, so check_canvas_rw_access must return an empty list.
    to_data_url_script = "http://example.com/fp.js"
    fill_text_script = "http://example.com/ga.js"

    to_data_url_ev = cm.BrowserEvent()
    to_data_url_ev.event_type = cm.EVENT_TODATAURL
    to_data_url_ev.js_file = to_data_url_script

    fill_text_ev = cm.BrowserEvent()
    fill_text_ev.event_type = cm.EVENT_FILLTEXT
    fill_text_ev.js_file = fill_text_script

    detected_urls = ex.check_canvas_rw_access([to_data_url_ev, fill_text_ev])
    self.assertEqual([], detected_urls)
def ff_log_parser(log_file, visit_info): events = [] fp_events = [] for line in gen_cat_file(log_file): # print line if line.startswith("FPD"): line = line.replace("FPD\t", "").rstrip() be = cm.BrowserEvent() log_items = line.split("\t") if len(log_items) == 5: be.initiator, be.event_type, be.js_file,\ be.js_line, be.log_text = log_items be.url = visit_info.url be.rank = visit_info.rank events.append(be) elif len(log_items) == 4 and log_items[1] in cm.EVENTS_WITH_4_ARGS: be.initiator, cm.event_type, be.js_file,\ be.js_line = log_items # filter out screenshots taken by browser for page thumbnails if not be.js_line.startswith('resource'): events.append(be) else: print "Hmm WAT!:", log_items canvas_fp_urls = check_canvas_rw_access(events) for canvas_fp_url in canvas_fp_urls: for event in events: # we only add events from this URL if event.js_file == canvas_fp_url: fp_events.append(event) return fp_events, events
def get_canvas_event(cur, canvas_ev_id):
    """Load one row of the canvas table by id and unpack it.

    cur -- database cursor used to run the query.
    canvas_ev_id -- value matched against the id column.

    Returns (visit_id, data_url_id, event_time, event), where event is a
    cm.BrowserEvent populated with the row's url, js_file, js_line and
    event_type columns. The row is expected to have exactly eight columns;
    the first one is discarded.
    """
    query = "SELECT * FROM canvas WHERE id=:Id"
    cur.execute(query, {"Id": canvas_ev_id})
    row = cur.fetchone()
    event = cm.BrowserEvent()
    (_, visit_id, event.url, event.js_file, event.js_line,
     event.event_type, event_time, data_url_id) = row
    return visit_id, data_url_id, event_time, event
def test_update_visit(self):
    # Insert a visit with ADD_VISIT, verify duration/incomplete read back
    # unchanged, then modify both, apply UPDATE_VISIT and verify again.
    visit_url = "http://example.com"
    started_at = strftime("%Y%m%d-%H%M%S")

    new_visit_ev = cm.BrowserEvent()
    new_visit_ev.event_type = cm.EVENT_NEW_VISIT

    visit = cm.VisitInfo()
    visit.url = visit_url
    visit.start_time = started_at
    visit.out_db = self.test_db
    visit.duration = 0
    visit.incomplete = 1

    def check_stored_visit():
        # Re-read the visit by id and compare the two fields under test.
        stored = dbu.get_db_entry(self.test_db, dbu.DBCmd.VISIT_BY_ID,
                                  visit.visit_id)
        self.assertEqual(visit.duration, stored.duration)
        self.assertEqual(visit.incomplete, stored.incomplete)

    visit.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, new_visit_ev,
                                      visit)
    check_stored_visit()

    visit.duration = 33
    visit.incomplete = 0
    dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, new_visit_ev, visit)
    check_stored_visit()
def parse_strace_logs(visit_info, test_lso=None): events = [] lines_seen = Set() # cm.print_log(visit_info, visit_info.sys_log, cm.LOG_DEBUG) for line in fu.gen_cat_file(visit_info.sys_log): if (".macromedia/Flash_Player/#SharedObjects" not in line or (line in lines_seen)): continue lines_seen.add(line) try: pieces = line.split("#SharedObjects/")[1].split("/") lso_file, mode_str = pieces[-1].split("\"") if "O_RDWR" in mode_str: mode = cm.ACCESS_MODE_READ_WRITE else: mode = cm.ACCESS_MODE_READ_ONLY domain = pieces[1] filename = line.split("\"")[1] file_ext = filename.rsplit(".", 1)[-1] # We observe .sxx extension for LSOs instead of .sol in strace logs # We recover the real filename by replacing # [pid 26407] open("/home/xyz/.macromedia/Flash_Player/#SharedObjects/GWZSHMBL/securehomes.esat.kuleuven.be/FlashCookie.sxx", O_RDWR|O_CREAT|O_APPEND, 0666) = 18 # noqa # [pid 26407] write(18, "\0\277\0\0\0-TCSO\0\4\0\0\0\0\0\vFlashCookie\0\0\0\3\21test_key\6\rjb0uf9\0", 51) = 51 # noqa # the only file in SharedObjects/GWZSHMBL/securehomes.esat.kuleuven.be would be a .sol file, which is not in the strace logs if it's created on this visit. # noqa if file_ext == "sxx": filename = filename.replace(".sxx", ".sol") if test_lso is not None: # just to simplify tests filename = test_lso # override the lso filename if not filename.endswith(".sol"): print "Unexpected LSO file extension", filename, file_ext, line continue if not os.path.isfile(filename): # Happens when the (temp) LSO is removed before the visit ends. 
print "Cannot find LSO file", filename, file_ext, line continue except ValueError: continue # TODO: move below code to a separate function try: lso_dict = sol.load(filename) for k, v in lso_dict.iteritems(): be = cm.BrowserEvent() be.event_type = cm.EVENT_FLASH_LSO be.js_file = lso_file be.initiator = domain be.mode = mode # this doesn't seem to work be.cookie_path = filename.split("#SharedObjects/")[1] be.key = unicode(k) try: be.log_text = unicode(v) except UnicodeDecodeError: # obj is byte string ascii_text = str(v).encode('string_escape') be.log_text = unicode(ascii_text) events.append(be) except Exception as exc: cm.print_log(visit_info, "Error parsing LSO %s %s" % (filename, exc)) return events
def test_r_w_visit_to_db(self):
    # Write a visit with ADD_VISIT and check that url and start_time read
    # back through VISIT_BY_ID equal the values that were written.
    visit_url = "http://example.com"
    started_at = strftime("%Y%m%d-%H%M%S")

    new_visit_ev = cm.BrowserEvent()
    new_visit_ev.event_type = cm.EVENT_NEW_VISIT

    visit = cm.VisitInfo()
    visit.url = visit_url
    visit.start_time = started_at
    visit.out_db = self.test_db

    stored_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, new_visit_ev, visit)
    stored = dbu.get_db_entry(self.test_db, dbu.DBCmd.VISIT_BY_ID, stored_id)

    for field in ("url", "start_time"):
        self.assertEqual(getattr(visit, field), getattr(stored, field))
def test_r_w_canvas_to_db(self):
    # Write a canvas event with ADD_CANVAS and check every value returned by
    # CANVAS_BY_ID against what was written.
    canvas_ev = cm.BrowserEvent()
    canvas_ev.event_type = cm.EVENT_TODATAURL
    canvas_ev.url = "http://example.com"
    canvas_ev.js_file = "http://example.com/fp.js"
    canvas_ev.js_line = 5
    canvas_ev.txt = "data:asdsads"

    visit = cm.VisitInfo()
    visit.visit_id = 1
    visit.out_db = self.test_db

    row_id = dbu.insert_to_db(dbu.DBCmd.ADD_CANVAS, canvas_ev, visit)
    self.assertGreater(row_id, 0)

    entry = dbu.get_db_entry(self.test_db, dbu.DBCmd.CANVAS_BY_ID, row_id)
    stored_visit_id, stored_data_url_id, stored_time, stored_ev = entry

    self.assertEqual(visit.visit_id, stored_visit_id)
    self.assertEqual(stored_data_url_id, 1)
    for field in ("event_type", "url", "js_file", "js_line"):
        self.assertEqual(getattr(stored_ev, field), getattr(canvas_ev, field))
    self.assertEqual(stored_time, 0)
def visit_page(url_tuple, timeout=cm.HARD_TIME_OUT,
               wait_on_site=cm.WAIT_ON_SITE, pre_crawl_sleep=False,
               out_dir=cm.BASE_TMP_DIR, flash_support=cm.FLASH_ENABLE,
               cookie_support=cm.COOKIE_ALLOW_ALL):
    """Visit a single page with an instrumented browser and process its logs.

    url_tuple -- either a (rank, url) pair or a bare URL; when it cannot be
        unpacked into two values the rank defaults to 0.
    timeout -- accepted for interface compatibility; not read in this body.
    wait_on_site -- seconds to sleep after the page load was requested.
    pre_crawl_sleep -- accepted for interface compatibility; not read here.
    out_dir -- directory for the per-visit logs, dumps and the database.
    flash_support -- forwarded to get_browser and process_crawler_output;
        when truthy, syscalls of the browser process are also logged.
    cookie_support -- forwarded to get_browser.

    Returns the value of process_crawler_output on success, or None when the
    visit times out or raises any other exception (the error is written via
    cm.print_error and clean_up is called).
    """
    driver = None
    visit_info = cm.VisitInfo()
    try:
        visit_info.rank, visit_info.url = url_tuple
    except (TypeError, ValueError):
        # When rank of the page is not provided, we'll use rank=0.
        # Only unpacking failures are handled here, so KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        visit_info.rank, visit_info.url = 0, url_tuple

    visit_info.sys_log = join(
        out_dir, "syscall-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_log = join(
        out_dir, "http-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_dump = join(
        out_dir, "mitm-%s-%s.dmp" % (visit_info.rank, ut.rand_str()))
    visit_info.start_time = strftime("%Y%m%d-%H%M%S")
    visit_info.out_dir = out_dir
    visit_info.out_db = join(visit_info.out_dir, cm.DB_FILENAME)
    visit_info.err_log = join(out_dir, "error.log")
    visit_info.debug_log = join(out_dir, "debug.log")
    be = cm.BrowserEvent()
    be.event_type = cm.EVENT_NEW_VISIT
    # The log file is opened with the URL as given, before a scheme is added.
    visit_info.ff_log = open_log_file(out_dir, visit_info.url)
    # Compare the first five characters against the known scheme prefixes
    # ('https' is the five-character prefix of 'https:').
    if not visit_info.url[:5] in ('data:', 'http:', 'https', 'file:'):
        visit_info.url = 'http://' + visit_info.url

    try:
        visit_info.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, be,
                                               visit_info)
        cm.print_debug(
            visit_info, "Visiting: %s %s (%s)" %
            (visit_info.visit_id, visit_info.url, visit_info.rank))
        setup_nspr_logging(visit_info.http_log)
        visit_info.vdisplay = start_xvfb()
        port, visit_info.mitm_proc = start_mitm_capture(visit_info.http_dump)
        driver, visit_info.profile_dir, visit_info.sel_proc =\
            get_browser(visit_info.ff_log, port, flash_support,
                        cookie_support)
        if flash_support:
            visit_info.strace_proc = log_syscalls(visit_info.sel_proc,
                                                  visit_info.sys_log)
        #############################################################
        driver_get(driver, visit_info, cm.SOFT_TIMEOUT)  # real visit
        #############################################################
        time.sleep(wait_on_site)
        close_driver(driver, timeout=10)
        # NOTE(review): strace_proc is only assigned above when flash_support
        # is truthy; presumably VisitInfo provides a default -- confirm.
        stop_strace(visit_info.strace_proc)
        result_dict = process_crawler_output(visit_info.ff_log, visit_info,
                                             flash_support)
        cm.print_debug(
            visit_info, "Visit OK: %s %s (%s)" %
            (visit_info.visit_id, visit_info.url, visit_info.rank))
        visit_info.incomplete = 0
        dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, be, visit_info)
        quit_driver(driver)
        stop_xvfb(visit_info.vdisplay)
        remove_visit_files(visit_info)
    except (cm.TimeExceededError, sel_exceptions.TimeoutException) as texc:
        err_str = "Visit to %s(%s) timed out %s" % \
            (visit_info.url, visit_info.rank, texc)
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    except Exception as exc:
        # Top-level boundary for a visit: log with traceback and give up.
        err_str = "Exception visiting %s(%s) %s %s" % \
            (visit_info.url, visit_info.rank, exc, traceback.format_exc())
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    else:
        return result_dict