def test_rand_str(self):
    """Check rand_str with its defaults and with explicit sizes and charsets."""
    # Test default parameters
    random_str = ut.rand_str()
    self.assertEqual(ut.DEFAULT_RAND_STR_SIZE, len(random_str),
                     "rand_str does not return string with default size!")
    self.assertFalse(set(random_str) - set(ut.DEFAULT_RAND_STR_CHARS),
                     "Unexpected characters in string!")

    # Test with different sizes and charsets
    sizes = [1, 2, 10, 100]
    charsets = (ut.DEFAULT_RAND_STR_CHARS, ut.DIGITS)
    for size in sizes:
        for charset in charsets:
            random_str = ut.rand_str(size, charset)
            self.assertEqual(len(random_str), size,
                             "Unexpected random string size!")
            # Compare against the charset actually requested; checking the
            # default charset here would let out-of-charset characters pass.
            self.assertFalse(set(random_str) - set(charset),
                             "Unexpected characters in string!")
def test_disable_flash(self):
    """Visit the LSO-setting test page with Flash disabled; expect no Flash cookies."""
    lso_value = rand_str()
    query = '?lso_test_key=%s&lso_test_value=%s' % ("test_key", lso_value)
    page_url = cm.BASE_TEST_URL + '/evercookie/lso/setlso.html' + query

    visit_result = ffm.visit_page(page_url,
                                  wait_on_site=3,
                                  flash_support=cm.FLASH_DISABLE)

    # With Flash disabled the visit must not report any Flash cookies.
    flash_cookies = visit_result["flash_cookies"]
    self.assertEqual(len(flash_cookies), 0)
def get_ff_cache(profile_dir, store_body=False):
    """Recover the Firefox disk cache of a profile and return its long-lived items.

    The cache map under ``<profile_dir>/Cache`` is dumped into a temporary
    directory by the external Perl script, every ``*_metadata`` file of the
    dump is parsed, and items whose expiry is at least ``DELTA_MONTH`` away
    are collected.

    Args:
        profile_dir: path of the Firefox profile directory.
        store_body: when true, keep the cached body in ``item["Body"]``;
            otherwise ``item["Body"]`` is an empty string.

    Returns:
        A list of item dicts. Returns ``[]`` when the profile has no
        ``Cache`` directory.
    """
    cache_dir = os.path.join(profile_dir, "Cache")
    if not os.path.isdir(cache_dir):
        return []  # Firefox updated the cache dir structure since our study

    cache_map = os.path.join(cache_dir, "_CACHE_MAP_")
    cache_dump = os.path.join(BASE_TMP_DIR,
                              append_timestamp("cache") + rand_str())
    create_dir(cache_dump)

    cache_items = []
    db_items = ("Etag", "Request String", "Expires", "Cache-Control")
    metadata_suffix = "_metadata"
    try:
        subprocess.call([PERL_PATH, CACHE_PERL_SCRIPT, cache_map,
                         "--recover=" + cache_dump])
        for fname in glob(os.path.join(cache_dump, "*" + metadata_suffix)):
            item = {}
            try:
                with open(fname) as f:
                    metadata = f.read()
                item = parse_metadata(metadata)
                # Make sure every column exists, so lookups below are safe.
                for db_item in db_items:
                    if db_item not in item:
                        item[db_item] = ""

                # If a response includes both an Expires header and a max-age
                # directive, the max-age directive overrides the Expires header
                # (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
                expiry_delta_sec = 0
                # parse expiry date
                # NOTE(review): datetime.now() is naive local time; confirm
                # parse_date returns a comparable naive datetime.
                expiry = parse_date(item["Expires"])
                if expiry:
                    expiry_delta = expiry - datetime.now()
                    expiry_delta_sec = expiry_delta.total_seconds()

                # The key is "Cache-Control" (no trailing colon); testing for
                # "Cache-Control:" meant max-age never overrode Expires.
                if item["Cache-Control"]:
                    # parse max-age directive
                    cache_directives = parse_cache_control_header(
                        item["Cache-Control"], cls=ResponseCacheControl)
                    if "max-age" in cache_directives:
                        # The directive may be stored as a string; coerce it
                        # so the numeric comparison below is meaningful.
                        try:
                            expiry_delta_sec = int(cache_directives["max-age"])
                        except (TypeError, ValueError):
                            pass  # malformed max-age: keep the Expires value

                if expiry_delta_sec < DELTA_MONTH:
                    continue
                item["Expiry-Delta"] = expiry_delta_sec

                # The body file has the same name minus the "_metadata" suffix;
                # read it as bytes since cached content is arbitrary data.
                with open(fname[:-len(metadata_suffix)], "rb") as f:
                    data = f.read()
                item["Body"] = data if store_body else ""  # store as BLOB
                item["Hash"] = hash_text(base64.b64encode(data))
            except IOError as exc:
                # NOTE(review): as in the original, the (possibly partial)
                # item is still appended after an I/O error.
                print("Error processing cache: %s: %s" %
                      (exc, traceback.format_exc()))
            cache_items.append(item)
    finally:
        # Always remove the temporary dump, even if parsing blew up.
        if os.path.isdir(cache_dump):
            shutil.rmtree(cache_dump)
    return cache_items
def get_ff_cache(profile_dir, store_body=False):
    """Recover the Firefox disk cache of a profile and return its long-lived items.

    The cache map under ``<profile_dir>/Cache`` is dumped into a temporary
    directory by the external Perl script, every ``*_metadata`` file of the
    dump is parsed, and items whose expiry is at least ``DELTA_MONTH`` away
    are collected.

    Args:
        profile_dir: path of the Firefox profile directory.
        store_body: when true, keep the cached body in ``item["Body"]``;
            otherwise ``item["Body"]`` is an empty string.

    Returns:
        A list of item dicts. Returns ``[]`` when the profile has no
        ``Cache`` directory.
    """
    cache_dir = os.path.join(profile_dir, "Cache")
    if not os.path.isdir(cache_dir):
        return []  # Firefox updated the cache dir structure since our study

    cache_map = os.path.join(cache_dir, "_CACHE_MAP_")
    cache_dump = os.path.join(BASE_TMP_DIR,
                              append_timestamp("cache") + rand_str())
    create_dir(cache_dump)

    cache_items = []
    db_items = ("Etag", "Request String", "Expires", "Cache-Control")
    metadata_suffix = "_metadata"
    try:
        subprocess.call(
            [PERL_PATH, CACHE_PERL_SCRIPT, cache_map,
             "--recover=" + cache_dump])
        for fname in glob(os.path.join(cache_dump, "*" + metadata_suffix)):
            item = {}
            try:
                with open(fname) as f:
                    metadata = f.read()
                item = parse_metadata(metadata)
                # Make sure every column exists, so lookups below are safe.
                for db_item in db_items:
                    if db_item not in item:
                        item[db_item] = ""

                # If a response includes both an Expires header and a max-age
                # directive, the max-age directive overrides the Expires header
                # (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
                expiry_delta_sec = 0
                # parse expiry date
                # NOTE(review): datetime.now() is naive local time; confirm
                # parse_date returns a comparable naive datetime.
                expiry = parse_date(item["Expires"])
                if expiry:
                    expiry_delta = expiry - datetime.now()
                    expiry_delta_sec = expiry_delta.total_seconds()

                # The key is "Cache-Control" (no trailing colon); testing for
                # "Cache-Control:" meant max-age never overrode Expires.
                if item["Cache-Control"]:
                    # parse max-age directive
                    cache_directives = parse_cache_control_header(
                        item["Cache-Control"], cls=ResponseCacheControl)
                    if "max-age" in cache_directives:
                        # The directive may be stored as a string; coerce it
                        # so the numeric comparison below is meaningful.
                        try:
                            expiry_delta_sec = int(cache_directives["max-age"])
                        except (TypeError, ValueError):
                            pass  # malformed max-age: keep the Expires value

                if expiry_delta_sec < DELTA_MONTH:
                    continue
                item["Expiry-Delta"] = expiry_delta_sec

                # The body file has the same name minus the "_metadata" suffix;
                # read it as bytes since cached content is arbitrary data.
                with open(fname[:-len(metadata_suffix)], "rb") as f:
                    data = f.read()
                item["Body"] = data if store_body else ""  # store as BLOB
                item["Hash"] = hash_text(base64.b64encode(data))
            except IOError as exc:
                # NOTE(review): as in the original, the (possibly partial)
                # item is still appended after an I/O error.
                print("Error processing cache: %s: %s" %
                      (exc, traceback.format_exc()))
            cache_items.append(item)
    finally:
        # Always remove the temporary dump, even if parsing blew up.
        if os.path.isdir(cache_dump):
            shutil.rmtree(cache_dump)
    return cache_items
def test_rand_str(self):
    """Check rand_str with its defaults and with explicit sizes and charsets."""
    # Test default parameters
    random_str = ut.rand_str()
    self.assertEqual(ut.DEFAULT_RAND_STR_SIZE, len(random_str),
                     "rand_str does not return string with default size!")
    self.assertFalse(
        set(random_str) - set(ut.DEFAULT_RAND_STR_CHARS),
        "Unexpected characters in string!")

    # Test with different sizes and charsets
    sizes = [1, 2, 10, 100]
    charsets = (ut.DEFAULT_RAND_STR_CHARS, ut.DIGITS)
    for size in sizes:
        for charset in charsets:
            random_str = ut.rand_str(size, charset)
            self.assertEqual(len(random_str), size,
                             "Unexpected random string size!")
            # Compare against the charset actually requested; checking the
            # default charset here would let out-of-charset characters pass.
            self.assertFalse(
                set(random_str) - set(charset),
                "Unexpected characters in string!")
def test_get_lso_from_visit(self):
    """Visit the LSO-setting test page and check the Flash cookie is reported.

    Every reported Flash cookie must carry the Flash LSO event type and an
    initiator containing the online test host; the one whose key equals
    TEST_LSO_KEYNAME must hold the random value sent in the query string.
    """
    lso_found = False
    lso_value = rand_str()
    # NOTE(review): the key sent here is the literal "test_key" while the
    # lookup below uses TEST_LSO_KEYNAME -- confirm the two agree.
    qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key", lso_value)
    test_url = cm.BASE_TEST_URL + '/evercookie/lso/setlso.html' + qry_str
    results = ffm.visit_page(test_url, wait_on_site=3)
    lso_items = results["flash_cookies"]
    # assertTrue replaces the deprecated failUnless alias.
    self.assertTrue(len(lso_items))
    for test_lso in lso_items:
        self.assertEqual(test_lso.event_type, cm.EVENT_FLASH_LSO)
        self.assertIn(cm.ONLINE_TEST_HOST, test_lso.initiator)
        if TEST_LSO_KEYNAME == test_lso.key:
            self.assertEqual(lso_value, test_lso.log_text)
            lso_found = True
    self.assertTrue(lso_found,
                    "Cannot find LSO with the value %s in %s" %
                    (lso_value, lso_items))
def test_get_lso_from_visit(self):
    """Visit the LSO-setting test page and check the Flash cookie is reported.

    Every reported Flash cookie must carry the Flash LSO event type and an
    initiator containing the online test host; the one whose key equals
    TEST_LSO_KEYNAME must hold the random value sent in the query string.
    """
    lso_found = False
    lso_value = rand_str()
    # NOTE(review): the key sent here is the literal "test_key" while the
    # lookup below uses TEST_LSO_KEYNAME -- confirm the two agree.
    qry_str = '?lso_test_key=%s&lso_test_value=%s' % ("test_key", lso_value)
    test_url = cm.BASE_TEST_URL + '/evercookie/lso/setlso.html' + qry_str
    results = ffm.visit_page(test_url, wait_on_site=3)
    lso_items = results["flash_cookies"]
    # assertTrue replaces the deprecated failUnless alias.
    self.assertTrue(len(lso_items))
    for test_lso in lso_items:
        self.assertEqual(test_lso.event_type, cm.EVENT_FLASH_LSO)
        self.assertIn(cm.ONLINE_TEST_HOST, test_lso.initiator)
        if TEST_LSO_KEYNAME == test_lso.key:
            self.assertEqual(lso_value, test_lso.log_text)
            lso_found = True
    self.assertTrue(
        lso_found,
        "Cannot find LSO with the value %s in %s" % (lso_value, lso_items))
def open_log_file(out_dir, url):
    """Build the path of a Firefox log file for ``url`` inside ``out_dir``.

    ``out_dir`` is created when it is not already a directory. Despite the
    name, no file is opened here: only the ``<basename>.log`` path is
    returned, where the basename comes from get_basename_from_url called with
    the URL and a random ``ff-`` prefixed string.
    """
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    random_tag = "ff-%s" % ut.rand_str()
    log_name = '%s.log' % (get_basename_from_url(url, random_tag))
    return join(out_dir, log_name)
def visit_page(url_tuple, timeout=cm.HARD_TIME_OUT,
               wait_on_site=cm.WAIT_ON_SITE, pre_crawl_sleep=False,
               out_dir=cm.BASE_TMP_DIR, flash_support=cm.FLASH_ENABLE,
               cookie_support=cm.COOKIE_ALLOW_ALL):
    """Visit one page with the instrumented browser and return the processed output.

    Args:
        url_tuple: a ``(rank, url)`` pair, or a bare URL (rank 0 is used).
        timeout: kept for interface compatibility; not used in this body.
        wait_on_site: seconds to sleep after the page load call returns.
        pre_crawl_sleep: kept for interface compatibility; not used in this
            body.
        out_dir: directory in which the per-visit log, dump and DB paths are
            placed.
        flash_support: forwarded to get_browser and process_crawler_output;
            syscall logging is started only when it is truthy.
        cookie_support: forwarded to get_browser.

    Returns:
        The value returned by process_crawler_output on success, or None if
        the visit timed out or raised any other exception.
    """
    driver = None
    visit_info = cm.VisitInfo()
    try:
        visit_info.rank, visit_info.url = url_tuple
    except (TypeError, ValueError):
        # When rank of the page is not provided, we'll use rank=0.
        # Only unpacking failures are caught, so KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        visit_info.rank, visit_info.url = 0, url_tuple

    visit_info.sys_log = join(
        out_dir, "syscall-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_log = join(
        out_dir, "http-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_dump = join(
        out_dir, "mitm-%s-%s.dmp" % (visit_info.rank, ut.rand_str()))
    visit_info.start_time = strftime("%Y%m%d-%H%M%S")
    visit_info.out_dir = out_dir
    visit_info.out_db = join(visit_info.out_dir, cm.DB_FILENAME)
    visit_info.err_log = join(out_dir, "error.log")
    visit_info.debug_log = join(out_dir, "debug.log")

    be = cm.BrowserEvent()
    be.event_type = cm.EVENT_NEW_VISIT

    # The log path is derived from the URL before a scheme is prepended.
    visit_info.ff_log = open_log_file(out_dir, visit_info.url)
    if not visit_info.url[:5] in ('data:', 'http:', 'https', 'file:'):
        visit_info.url = 'http://' + visit_info.url

    try:
        visit_info.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, be,
                                               visit_info)
        cm.print_debug(
            visit_info,
            "Visiting: %s %s (%s)" % (visit_info.visit_id, visit_info.url,
                                      visit_info.rank))
        setup_nspr_logging(visit_info.http_log)
        visit_info.vdisplay = start_xvfb()
        port, visit_info.mitm_proc = start_mitm_capture(visit_info.http_dump)
        driver, visit_info.profile_dir, visit_info.sel_proc =\
            get_browser(visit_info.ff_log, port, flash_support,
                        cookie_support)
        if flash_support:
            visit_info.strace_proc = log_syscalls(visit_info.sel_proc,
                                                  visit_info.sys_log)
        #############################################################
        driver_get(driver, visit_info, cm.SOFT_TIMEOUT)  # real visit
        #############################################################
        time.sleep(wait_on_site)
        close_driver(driver, timeout=10)
        # NOTE(review): strace_proc is only assigned above when flash_support
        # is truthy; presumably VisitInfo provides a default -- confirm.
        stop_strace(visit_info.strace_proc)
        result_dict = process_crawler_output(visit_info.ff_log, visit_info,
                                             flash_support)
        cm.print_debug(
            visit_info,
            "Visit OK: %s %s (%s)" % (visit_info.visit_id, visit_info.url,
                                      visit_info.rank))
        visit_info.incomplete = 0
        dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, be, visit_info)
        quit_driver(driver)
        stop_xvfb(visit_info.vdisplay)
        remove_visit_files(visit_info)
    except (cm.TimeExceededError, sel_exceptions.TimeoutException) as texc:
        err_str = "Visit to %s(%s) timed out %s" % \
            (visit_info.url, visit_info.rank, texc)
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    except Exception as exc:
        # Top-level boundary for a single visit: log with traceback, clean
        # up, and report failure to the caller as None.
        err_str = "Exception visiting %s(%s) %s %s" % \
            (visit_info.url, visit_info.rank, exc, traceback.format_exc())
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    else:
        return result_dict
def test_write_to_file(self):
    """Write a random string with write_to_file and read the same text back."""
    target_path = self.new_temp_file('write_test.txt')
    payload = ut.rand_str(100)

    fu.write_to_file(target_path, payload)

    # Reading the file back must give exactly what was written.
    read_back = fu.read_file(target_path)
    self.assertEqual(payload, read_back)
def visit_page(url_tuple, timeout=cm.HARD_TIME_OUT,
               wait_on_site=cm.WAIT_ON_SITE, pre_crawl_sleep=False,
               out_dir=cm.BASE_TMP_DIR, flash_support=cm.FLASH_ENABLE,
               cookie_support=cm.COOKIE_ALLOW_ALL):
    """Visit one page with the instrumented browser and return the processed output.

    Args:
        url_tuple: a ``(rank, url)`` pair, or a bare URL (rank 0 is used).
        timeout: kept for interface compatibility; not used in this body.
        wait_on_site: seconds to sleep after the page load call returns.
        pre_crawl_sleep: kept for interface compatibility; not used in this
            body.
        out_dir: directory in which the per-visit log, dump and DB paths are
            placed.
        flash_support: forwarded to get_browser and process_crawler_output;
            syscall logging is started only when it is truthy.
        cookie_support: forwarded to get_browser.

    Returns:
        The value returned by process_crawler_output on success, or None if
        the visit timed out or raised any other exception.
    """
    driver = None
    visit_info = cm.VisitInfo()
    try:
        visit_info.rank, visit_info.url = url_tuple
    except (TypeError, ValueError):
        # When rank of the page is not provided, we'll use rank=0.
        # Only unpacking failures are caught, so KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        visit_info.rank, visit_info.url = 0, url_tuple

    visit_info.sys_log = join(out_dir, "syscall-%s-%s.log" %
                              (visit_info.rank, ut.rand_str()))
    visit_info.http_log = join(out_dir, "http-%s-%s.log" %
                               (visit_info.rank, ut.rand_str()))
    visit_info.http_dump = join(out_dir, "mitm-%s-%s.dmp" %
                                (visit_info.rank, ut.rand_str()))
    visit_info.start_time = strftime("%Y%m%d-%H%M%S")
    visit_info.out_dir = out_dir
    visit_info.out_db = join(visit_info.out_dir, cm.DB_FILENAME)
    visit_info.err_log = join(out_dir, "error.log")
    visit_info.debug_log = join(out_dir, "debug.log")

    be = cm.BrowserEvent()
    be.event_type = cm.EVENT_NEW_VISIT

    # The log path is derived from the URL before a scheme is prepended.
    visit_info.ff_log = open_log_file(out_dir, visit_info.url)
    if not visit_info.url[:5] in ('data:', 'http:', 'https', 'file:'):
        visit_info.url = 'http://' + visit_info.url

    try:
        visit_info.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, be,
                                               visit_info)
        cm.print_debug(visit_info, "Visiting: %s %s (%s)" %
                       (visit_info.visit_id, visit_info.url,
                        visit_info.rank))
        setup_nspr_logging(visit_info.http_log)
        visit_info.vdisplay = start_xvfb()
        port, visit_info.mitm_proc = start_mitm_capture(visit_info.http_dump)
        driver, visit_info.profile_dir, visit_info.sel_proc =\
            get_browser(visit_info.ff_log, port, flash_support,
                        cookie_support)
        if flash_support:
            visit_info.strace_proc = log_syscalls(visit_info.sel_proc,
                                                  visit_info.sys_log)
        #############################################################
        driver_get(driver, visit_info, cm.SOFT_TIMEOUT)  # real visit
        #############################################################
        time.sleep(wait_on_site)
        close_driver(driver, timeout=10)
        # NOTE(review): strace_proc is only assigned above when flash_support
        # is truthy; presumably VisitInfo provides a default -- confirm.
        stop_strace(visit_info.strace_proc)
        result_dict = process_crawler_output(visit_info.ff_log, visit_info,
                                             flash_support)
        cm.print_debug(visit_info, "Visit OK: %s %s (%s)" %
                       (visit_info.visit_id, visit_info.url,
                        visit_info.rank))
        visit_info.incomplete = 0
        dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, be, visit_info)
        quit_driver(driver)
        stop_xvfb(visit_info.vdisplay)
        remove_visit_files(visit_info)
    except (cm.TimeExceededError, sel_exceptions.TimeoutException) as texc:
        err_str = "Visit to %s(%s) timed out %s" % \
            (visit_info.url, visit_info.rank, texc)
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    except Exception as exc:
        # Top-level boundary for a single visit: log with traceback, clean
        # up, and report failure to the caller as None.
        err_str = "Exception visiting %s(%s) %s %s" % \
            (visit_info.url, visit_info.rank, exc, traceback.format_exc())
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    else:
        return result_dict