def quit(self):
    """Override the base class quit, also removing the timestamped profile dir.

    If the webdriver connection is already broken (CannotSendRequest),
    replicate the cleanup that webdriver.firefox.webdriver.quit() would
    have performed before it was interrupted.
    """
    self.is_running = False
    try:
        wl_log.info("Quit: Removing profile dir")
        shutil.rmtree(self.prof_dir_path)
        super(TorBrowserDriver, self).quit()
    except CannotSendRequest:
        wl_log.error("CannotSendRequest while quitting TorBrowserDriver",
                     exc_info=False)
        # following is copied from webdriver.firefox.webdriver.quit() which
        # was interrupted due to an unhandled CannotSendRequest exception.
        self.binary.kill()  # kill the browser
        try:
            # remove the profile folder
            shutil.rmtree(self.profile.path)
            if self.profile.tempfolder is not None:
                shutil.rmtree(self.profile.tempfolder)
        except Exception as e:
            # was print(str(e)); route through the crawl logger like every
            # other error in this module so it ends up in the crawl log
            wl_log.error("Exception removing profile dirs: %s", str(e))
    except Exception:
        wl_log.error("Exception while quitting TorBrowserDriver",
                     exc_info=True)
def clone_dir_with_timestap(orig_dir_path):
    """Copy a folder into the same directory, appending a timestamp.

    Returns the path of the new timestamped directory. The missing return
    was a bug: callers (e.g. init_tbb_profile) assign this function's
    result, and previously received None. Also replaces the Python-2-only
    `except Exception, e` syntax with the portable `as` form.
    """
    new_dir = create_dir(append_timestamp(orig_dir_path))
    try:
        du.copy_tree(orig_dir_path, new_dir)
    except Exception as e:
        wl_log.error("Error while cloning the dir with timestamp" + str(e))
    return new_dir
def get_screenshot_if_enabled(self):
    """Save a screenshot of the current page when screenshots are enabled.

    Failures (including the 5s timeout) are logged, never raised.
    """
    if not self.screenshots:
        return
    try:
        with ut.timeout(5):
            self.driver.get_screenshot_as_file(self.job.png_file)
    except Exception:
        wl_log.error("Cannot get screenshot.")
def main():
    """Entry point: build a Crawler from CLI arguments and run the crawl."""
    args = parse_arguments()
    url_list = read_list_urls(args)
    # torrc settings depend on the experiment type
    torrc_dict = cm.TORRC_BY_TYPE[args.experiment]
    crawler = Crawler(url_list,
                      torrc_dict,
                      output=args.output,
                      experiment=args.experiment,
                      xvfb=args.xvfb,
                      capture_screen=True)
    try:
        crawler.crawl(args.batches, args.instances,
                      start_line=args.start_line - 1)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
    except Exception:
        wl_log.error("Exception: \n%s" % (traceback.format_exc()))
    finally:
        # always shut the crawl down cleanly, whatever happened above
        crawler.stop_crawl()
def filter_packets_without_guard_ip(self):
    """Filter the tshark capture down to packets involving a guard IP.

    Filtering errors are logged and swallowed.
    """
    guard_ips = set(ip for ip in self.controller.get_all_guard_ips())
    wl_log.info("Found %s guards in the consensus.", len(guard_ips))
    wl_log.info("Filtering packets without a guard IP.")
    try:
        ut.filter_tshark(self.job.tshark_file, guard_ips)
    except Exception as e:
        wl_log.error("ERROR: filtering tshark log: %s.", e)
        # fixed attribute typo: was `self.job.thsark_file`, which would
        # raise AttributeError inside this handler and mask the real error
        wl_log.error("Check tshark log: %s", self.job.tshark_file)
def post_visit(self):
    """After a visit, strip non-guard packets from the captured pcap."""
    guard_ips = {ip for ip in self.controller.get_all_guard_ips()}
    wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
    wl_log.info("Filtering packets without a guard IP.")
    try:
        ut.filter_pcap(self.job.pcap_file, guard_ips)
    except Exception as filter_err:
        wl_log.error("ERROR: filtering pcap file: %s.", filter_err)
        wl_log.error("Check pcap: %s", self.job.pcap_file)
def init_tbb_profile(self, version):
    """Clone the TBB profile dir (timestamped) and wrap it in a FirefoxProfile.

    Returns the FirefoxProfile, or None (implicitly) if creation fails.
    """
    self.prof_dir_path = clone_dir_with_timestap(
        cm.get_tbb_profile_path(version))
    if self.capture_screen and self.page_url:
        self.add_canvas_permission()
    try:
        profile = webdriver.FirefoxProfile(self.prof_dir_path)
    except Exception:
        wl_log.error("Error creating the TB profile", exc_info=True)
    else:
        return profile
def __do_visit(self):
    """Load the job URL while sniffing traffic, bounded by a hard timeout."""
    with Sniffer(path=self.job.pcap_file, filter=cm.DEFAULT_FILTER):
        sleep(1)  # make sure dumpcap is running
        try:
            with ut.timeout(cm.HARD_VISIT_TIMEOUT):
                self.driver.get(self.job.url)
                sleep(float(self.job.config['pause_in_site']))
        except (cm.HardTimeoutException, TimeoutException):
            # both the hard wall-clock limit and selenium's own timeout
            wl_log.error("Visit to %s has timed out!", self.job.url)
        except Exception as err:
            wl_log.error("Unknown exception: %s", err)
def crawl_urls(br_type, urls, fn=lambda x: x):
    """Crawl each URL with a freshly initialized browser of type br_type.

    A browser-init failure is logged and the URL is skipped. The browser
    is always quit after a crawl attempt (now in a `finally`, so it is
    closed even if the crawl raises something other than Exception).
    """
    for url in urls:
        try:
            br = init_browser(br_type)
        except Exception:
            # narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit can still abort the whole crawl
            wl_log.critical('Init browser', exc_info=True)
        else:
            try:
                crawl_url(br, url, fn)
            except Exception as e:
                wl_log.error("Error crawling %s: %s" % (url, e))
            finally:
                br.quit()
def crawl_urls(br_type, urls, fn=lambda x: x):
    """Crawl each URL with a freshly initialized browser of type br_type.

    Browser-init failures are logged and the URL skipped; the browser is
    quit in a `finally` so it is released even on non-Exception errors.
    """
    for url in urls:
        try:
            br = init_browser(br_type)
        except Exception:
            # was a bare `except:` — that also swallowed KeyboardInterrupt
            # and SystemExit, making the crawl hard to abort
            wl_log.critical('Init browser', exc_info=True)
        else:
            try:
                crawl_url(br, url, fn)
            except Exception as e:
                wl_log.error("Error crawling %s: %s" % (url, e))
            finally:
                br.quit()
def parse_url_list(file_path, start, stop):
    """Return the URLs (one per line) in file_path, sliced to [start, stop].

    Line numbers are 1-indexed and both bounds are inclusive. On any read
    error the problem is logged and the process exits with status 1.
    """
    try:
        with open(file_path) as url_file:
            lines = url_file.read().splitlines()
        return lines[start - 1:stop]
    except Exception as e:
        wl_log.error("ERROR: while parsing URL list: {} \n{}".format(
            e, traceback.format_exc()))
        sys.exit(1)
def __do_instance(self):
    # Run every visit of the current job: one fresh browser per visit
    # (driver.launch() is a context manager — presumably __enter__ starts
    # the browser and __exit__ quits it; confirm against BrowserWrapper).
    for self.job.visit in xrange(self.job.visits):
        ut.create_dir(self.job.path)
        wl_log.info("*** Visit #%s to %s ***", self.job.visit, self.job.url)
        with self.driver.launch():
            try:
                self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
            except WebDriverException as seto_exc:
                # failing to set the timeout is not fatal; the visit proceeds
                wl_log.error("Setting soft timeout %s", seto_exc)
            self.__do_visit()
            if self.screenshots:
                try:
                    self.driver.get_screenshot_as_file(self.job.png_file)
                except WebDriverException:
                    wl_log.error("Cannot get screenshot.")
        # pause between visits happens after the browser has been torn down
        sleep(float(self.job.config['pause_between_visits']))
        self.post_visit()
def extract_tbb_tarball(archive_path):
    """Extract a TBB tarball next to itself, then move it under the
    archive's basename (path without the ".tar..." suffix).

    Returns True on success, False when extraction or the move fails.
    """
    def _shell(cmd, required_dir):
        # Run cmd; success means exit status 0 AND required_dir exists.
        status, txt = commands.getstatusoutput(cmd)
        ok = not status and os.path.isdir(required_dir)
        return ok, status, txt

    parent_dir = os.path.dirname(archive_path)
    extracted_dir = os.path.join(parent_dir, "tor-browser_en-US")
    tar_cmd = "tar xvf %s -C %s" % (archive_path, parent_dir)
    ok, status, txt = _shell(tar_cmd, extracted_dir)
    if not ok:
        wl_log.error("Error extracting TBB tarball %s: (%s: %s)"
                     % (tar_cmd, status, txt))
        return False

    dest_dir = archive_path.split(".tar")[0]
    mv_cmd = "mv %s %s" % (extracted_dir, dest_dir)
    ok, status, txt = _shell(mv_cmd, dest_dir)
    if not ok:
        wl_log.error("Error moving extracted TBB with the command %s: (%s: %s)"
                     % (mv_cmd, status, txt))
        return False
    return True
def extract_tbb_tarball(archive_path):
    """Extract a TBB tarball in place and rename the extracted folder to the
    archive's basename.

    Returns True if both shell steps succeed, otherwise False.
    """
    arch_dir = os.path.dirname(archive_path)
    extracted_dir = os.path.join(arch_dir, "tor-browser_en-US")
    dest_dir = archive_path.split(".tar")[0]
    # each step: (command, directory that must exist afterwards, error format)
    steps = (
        ("tar xvf %s -C %s" % (archive_path, arch_dir),
         extracted_dir,
         "Error extracting TBB tarball %s: (%s: %s)"),
        ("mv %s %s" % (extracted_dir, dest_dir),
         dest_dir,
         "Error moving extracted TBB with the command %s: (%s: %s)"),
    )
    for cmd, required_dir, err_fmt in steps:
        status, txt = commands.getstatusoutput(cmd)
        if status or not os.path.isdir(required_dir):
            wl_log.error(err_fmt % (cmd, status, txt))
            return False
    return True
def __do_instance(self):
    # Run every visit of the current job, one browser launch per visit.
    for self.job.visit in range(self.job.visits):
        ut.create_dir(self.job.path)
        wl_log.info("*** Visit #%s to %s ***", self.job.visit, self.job.url)
        # BrowserWrapper actually constructs a driver object here:
        # __enter__ starts a new browser and __exit__ quits it.
        # Known issue: after each driver.quit(), the newly started driver
        # reuses the previously created temporary profile.
        with self.driver.launch():
            try:
                self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
            except WebDriverException as seto_exc:
                # not fatal — continue the visit without the soft timeout
                wl_log.error("Setting soft timeout %s", seto_exc)
            self.__do_visit()
            if self.screenshots:
                try:
                    self.driver.get_screenshot_as_file(self.job.png_file)
                except WebDriverException:
                    wl_log.error("Cannot get screenshot.")
        # browser is down by this point; pause, then post-process the capture
        sleep(float(self.job.config['pause_between_visits']))
        self.post_visit()
def filter_guards_from_pcap(self):
    """Rewrite the pcap keeping only packets to/from a guard IP.

    Non-IP packets are kept as-is. The original capture is saved as
    "<pcap_path>.original" and removed only if filtering succeeds; on
    failure it is left in place for inspection.
    """
    guard_ips = set(ip for ip in self.tor_controller.get_all_guard_ips())
    # log message typo fixed ("concensus" -> "consensus"), matching the
    # wording used by the other guard-filtering helpers in this module
    wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
    orig_pcap = self.pcap_path + ".original"
    copyfile(self.pcap_path, orig_pcap)
    try:
        preader = PcapReader(orig_pcap)
        pcap_filtered = []
        for p in preader:
            if IP not in p:
                # keep non-IP traffic (e.g. ARP) untouched
                pcap_filtered.append(p)
                continue
            ip = p.payload
            if ip.dst in guard_ips or ip.src in guard_ips:
                pcap_filtered.append(p)
        wrpcap(self.pcap_path, pcap_filtered)
    except Exception as e:
        wl_log.error("ERROR: filtering pcap file: %s. Check old pcap: %s",
                     e, orig_pcap)
    else:
        os.remove(orig_pcap)
def __do_instance(self):
    # Run every visit of the job; each visit gets a fresh browser through
    # the driver.launch() context manager.
    for self.job.visit in xrange(self.job.visits):
        ut.create_dir(self.job.path)
        wl_log.info("*** Visit #%s to %s ***", self.job.visit, self.job.url)
        with self.driver.launch():
            try:
                self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
            except WebDriverException as seto_exc:
                # a failed timeout setting is logged but not fatal
                wl_log.error("Setting soft timeout %s", seto_exc)
            self.__do_visit()
            if self.screenshots:
                try:
                    self.driver.get_screenshot_as_file(self.job.png_file)
                except WebDriverException:
                    wl_log.error("Cannot get screenshot.")
            if self.har_export:
                try:
                    # ask the HAR export add-on for the log and persist it
                    # as JSON alongside the other visit artifacts
                    jscript = "return HAR.triggerExport().then(harLog => {return harLog;});"
                    har_string = self.driver.execute_script(jscript)
                    with open(self.job.har_file, 'w') as fd:
                        json.dump(har_string, fd)
                except WebDriverException:
                    wl_log.error("Cannot export HAR.")
        # pause after the browser has been torn down, then post-process
        sleep(float(self.job.config['pause_between_visits']))
        self.post_visit()
def die(last_words='Unknown problem, quitting!'):
    """Log *last_words* as an error and terminate the process with status 1."""
    wl_log.error(last_words)
    raise SystemExit(1)  # identical to sys.exit(1)
def die(last_words="Unknown problem, quitting!"):
    """Log *last_words* as an error, then exit the process with code 1."""
    wl_log.error(last_words)
    sys.exit(1)
def run():
    """Entry point: configure the crawler from CLI args + config file, then
    run the crawl and clean up.

    Exits with 0 on normal completion, -1 on keyboard interrupt, 1 when a
    recovery checkpoint file is missing.
    """
    args, config = parse_arguments()
    build_crawl_dirs()

    # Read URLs: either a file path or a comma-separated list on the CLI
    if isfile(args.urls):
        url_list = parse_url_list(args.urls, args.start, args.stop)
    else:
        try:
            url_list = args.urls.split(',')
        except Exception:
            # message typo fixed: "of a path" -> "or a path"
            wl_log.error("ERROR: expects a string with comma-separated list "
                         "of URLs or a path to file")
    host_list = [urlparse(url).hostname for url in url_list]

    # Configure logger
    add_log_file_handler(wl_log, cm.CRAWL_LOG_FILENAME)

    # Configure controller
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(tbb_path=args.tbb_path,
                               tor_binary_path=args.tor_binary_path,
                               tor_data_path=args.tor_data_path,
                               torrc_dict=torrc_config,
                               pollute=False)

    # Configure browser
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    ffprefs = ut.set_dict_value_types(ffprefs)
    print(ffprefs)
    addons_path = [abspath(args.addons_dir)] if args.addons_dir else []
    driver_config = {'tbb_path': cm.TBB_DIR,
                     'tor_cfg': USE_RUNNING_TOR,
                     'pref_dict': ffprefs,
                     'extensions': addons_path,
                     'socks_port': int(torrc_config['socksport']),
                     'control_port': int(torrc_config['controlport']),
                     'canvas_allowed_hosts': host_list}

    # Instantiate crawler
    crawl_type = getattr(crawler_mod, "Crawler" + args.type)
    crawler = crawl_type(controller, driver_config=driver_config,
                         device=args.device, screenshots=args.screenshots)

    # Configure crawl: recover from a checkpoint or build a fresh job
    if args.recover_file is not None:
        if isfile(args.recover_file):
            with open(args.recover_file) as fchkpt:
                job = pickle.load(fchkpt)
            wl_log.info("Job recovered: %s" % str(job))
        else:
            wl_log.error("Checkpoint file %s does not exist"
                         % args.recover_file)
            sys.exit(1)
    else:
        job_config = ut.get_dict_subconfig(config, args.config, "job")
        # get chunk of urls to crawl
        chunk = int(job_config.get('chunk', 0))
        chunks = int(job_config.get('chunks', 1))
        # floor division: plain `/` yields a float under Python 3, which
        # breaks the slice indices below
        range_chunk = len(url_list) // chunks
        if chunk == chunks - 1:
            # last chunk takes remaining urls
            url_list_chunk = url_list[chunk * range_chunk:]
        else:
            url_list_chunk = url_list[chunk * range_chunk:
                                      (chunk + 1) * range_chunk]
        job = crawler_mod.CrawlJob(job_config, url_list_chunk)

    # Run display
    xvfb_display = setup_virtual_display(args.virtual_display)

    # Run the crawl
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    except Exception as e:
        wl_log.error("ERROR: unknown exception while crawling: %s" % e)
    finally:
        # post-crawl cleanup runs whether the crawl succeeded or not
        post_crawl()
        ut.stop_xvfb(xvfb_display)
    wl_log.info("[tbcrawler] the crawl has finished.")
    sys.exit(0)
def __init__(self, tbb_binary_path=None, tbb_profile_dir=None,
             tbb_logfile_path=None, tbb_version=cm.TBB_DEFAULT_VERSION,
             page_url='', capture_screen=True):
    """Set up a timestamped TBB profile, binary and capabilities, then
    connect the Firefox webdriver.

    Connection failures are logged and swallowed; check `is_running` to
    see whether startup succeeded.
    """
    self.is_running = False
    self.tbb_version = tbb_version
    self.export_lib_path()
    # Initialize Tor Browser's profile
    self.page_url = page_url
    self.capture_screen = capture_screen
    self.profile = self.init_tbb_profile(tbb_version)
    # set homepage to a blank tab
    self.profile.set_preference('browser.startup.page', '0')
    self.profile.set_preference('browser.startup.homepage', 'about:newtab')
    # configure Firefox to use Tor SOCKS proxy
    self.profile.set_preference('network.proxy.type', 1)
    self.profile.set_preference('network.proxy.socks', '127.0.0.1')
    self.profile.set_preference('network.proxy.socks_port', cm.SOCKS_PORT)
    if cm.DISABLE_RANDOMIZEDPIPELINENING:
        self.profile.set_preference(
            'network.http.pipelining.max-optimistic-requests', 5000)
        self.profile.set_preference('network.http.pipelining.maxrequests',
                                    15000)
        self.profile.set_preference('network.http.pipelining', False)
    self.profile.set_preference('extensions.torlauncher.prompt_at_startup',
                                0)
    # Disable cache - Wang & Goldberg's setting
    self.profile.set_preference('network.http.use-cache', False)
    # non-standard load strategy: wait for the page to fully load
    self.profile.set_preference('webdriver.load.strategy', 'conservative')
    # prevent Tor Browser running its own Tor process
    self.profile.set_preference('extensions.torlauncher.start_tor', False)
    self.profile.set_preference(
        'extensions.torbutton.versioncheck_enabled', False)
    self.profile.set_preference('permissions.memory_only', False)
    self.profile.update_preferences()
    # Initialize Tor Browser's binary
    self.binary = self.get_tbb_binary(tbb_version=self.tbb_version,
                                      logfile=tbb_logfile_path)
    # Initialize capabilities
    self.capabilities = DesiredCapabilities.FIREFOX
    self.capabilities.update({'handlesAlerts': True,
                              'databaseEnabled': True,
                              'javascriptEnabled': True,
                              'browserConnectionEnabled': True})
    try:
        super(TorBrowserDriver,
              self).__init__(firefox_profile=self.profile,
                             firefox_binary=self.binary,
                             capabilities=self.capabilities)
        self.is_running = True
    except WebDriverException as error:
        wl_log.error(
            'WebDriverException while connecting to Webdriver %s' % error)
    except socket.error as skterr:
        wl_log.error('Error connecting to Webdriver', exc_info=True)
        wl_log.error(skterr)
    except Exception as e:
        wl_log.error('Error connecting to Webdriver: %s' % e, exc_info=True)
def __init__(self, tbb_binary_path=None, tbb_profile_dir=None,
             tbb_logfile_path=None, tbb_version=cm.TBB_DEFAULT_VERSION,
             page_url="", capture_screen=True):
    """Build a TBB-style profile, then start PhantomJS routed through the
    local Tor SOCKS proxy.

    Startup failures are logged and swallowed; `is_running` reports
    whether the webdriver connected successfully.
    """
    self.is_running = False
    self.tbb_version = tbb_version
    self.export_lib_path()
    # Initialize Tor Browser's profile
    self.page_url = page_url
    self.capture_screen = capture_screen
    self.profile = self.init_tbb_profile(tbb_version)
    # set homepage to a blank tab
    self.profile.set_preference('browser.startup.page', "0")
    self.profile.set_preference('browser.startup.homepage', 'about:newtab')
    # configure Firefox to use Tor SOCKS proxy
    self.profile.set_preference('network.proxy.type', 1)
    self.profile.set_preference('network.proxy.socks', '127.0.0.1')
    self.profile.set_preference('network.proxy.socks_port', cm.SOCKS_PORT)
    if cm.DISABLE_RANDOMIZEDPIPELINENING:
        self.profile.set_preference(
            'network.http.pipelining.max-optimistic-requests', 5000)
        self.profile.set_preference('network.http.pipelining.maxrequests',
                                    15000)
        self.profile.set_preference('network.http.pipelining', False)
    self.profile.set_preference('extensions.torlauncher.prompt_at_startup',
                                0)
    # Disable cache - Wang & Goldberg's setting
    self.profile.set_preference('network.http.use-cache', False)
    # http://www.w3.org/TR/webdriver/#page-load-strategies-1
    # wait for all frames to load and make sure there's no
    # outstanding http requests (except AJAX)
    # https://code.google.com/p/selenium/wiki/DesiredCapabilities
    self.profile.set_preference('webdriver.load.strategy', 'conservative')
    # Note that W3C doesn't mention "conservative", this may change in the
    # upcoming versions of the Firefox Webdriver
    # https://w3c.github.io/webdriver/webdriver-spec.html#the-page-load-strategy
    # prevent Tor Browser running its own Tor process
    self.profile.set_preference('extensions.torlauncher.start_tor', False)
    self.profile.set_preference(
        'extensions.torbutton.versioncheck_enabled', False)
    self.profile.set_preference('permissions.memory_only', False)
    self.profile.update_preferences()
    # Initialize Tor Browser's binary
    self.binary = self.get_tbb_binary(tbb_version=self.tbb_version,
                                      logfile=tbb_logfile_path)
    # Initialize capabilities — PhantomJS, despite the TBB-style profile
    self.capabilities = DesiredCapabilities.PHANTOMJS
    self.capabilities.update({'handlesAlerts': True,
                              'databaseEnabled': True,
                              'browserConnectionEnabled': True,
                              'javascriptEnabled': True})
    # route PhantomJS traffic through the local Tor SOCKS proxy
    service_args = ['--proxy=127.0.0.1:%s' % (cm.SOCKS_PORT),
                    '--proxy-type=socks5', ]
    try:
        super(TorBrowserDriver, self)\
            .__init__(executable_path="/usr/bin/phantomjs",
                      desired_capabilities=self.capabilities,
                      service_args=service_args)
        self.is_running = True
    except WebDriverException as error:
        wl_log.error(
            "WebDriverException while connecting to Webdriver %s" % error)
    except socket.error as skterr:
        wl_log.error("Error connecting to Webdriver", exc_info=True)
        wl_log.error(skterr.message)
    except Exception as e:
        wl_log.error("Error connecting to Webdriver: %s" % e, exc_info=True)
# NOTE(review): this fragment references `verbose`, `url_list_path`,
# `start_line`, `stop_line`, `tbb_version`, `no_of_batches`,
# `no_of_instances`, `xvfb` and `capture_screen`, none of which are
# defined here — presumably the body of a CLI entry function whose `def`
# is outside this view; confirm against the full file.
if verbose:
    wl_log.setLevel(logging.DEBUG)
else:
    wl_log.setLevel(logging.INFO)
# Validate the given arguments
# Read urls (one URL per line), then slice to the requested line range
url_list = np.loadtxt(url_list_path, delimiter='\n', dtype=str)
url_list = url_list.tolist()
url_list = url_list[start_line - 1:stop_line]
torrc_dict = cm.TORRC_DEFAULT
if not tbb_version:
    tbb_version = cm.TBB_DEFAULT_VERSION
elif tbb_version not in cm.TBB_KNOWN_VERSIONS:
    ut.die('Version of Tor browser is not recognized.')
crawler = Crawler(torrc_dict, url_list, tbb_version, xvfb, capture_screen)
wl_log.info('Command line parameters: %s' % sys.argv)
# Run the crawl
try:
    crawler.crawl(no_of_batches, no_of_instances,
                  start_line=start_line - 1)
except KeyboardInterrupt:
    wl_log.warning('Keyboard interrupt! Quitting...')
except Exception as e:
    wl_log.error('Exception: \n%s' % (traceback.format_exc()))
finally:
    # always stop the crawl cleanly, whatever happened above
    crawler.stop_crawl()
def set_page_load_timeout(self):
    """Apply the soft page-load timeout to the driver; log instead of raising."""
    try:
        self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
    except WebDriverException as exc:
        wl_log.error("Setting soft timeout %s", exc)