class TorBrowserWrapper(object):
    """Wraps the TorBrowserDriver to configure it at the constructor
    and run it with the `launch` method.

    We might consider to change the TorBrowserDriver itself to follow
    torcontroller and stem behaviour: init configures and a method is
    used to launch driver/controller, and this method is the one used
    to implement the contextmanager.
    """

    def __init__(self, *args, **kwargs):
        # Arguments are stored verbatim and forwarded to
        # TorBrowserDriver when `launch` is entered.
        self.args = args
        self.kwargs = kwargs
        self.driver = None

    def __getattr__(self, item):
        # Only invoked for attributes not found on the wrapper itself,
        # so `launch` (a class attribute) never reaches this method.
        # Before `launch` is entered there is no driver to delegate to;
        # return None so callers can test for it.
        if self.driver is None:
            return None
        return getattr(self.driver, item)

    @contextmanager
    def launch(self):
        """Start the wrapped driver, yield it, and always quit it."""
        self.driver = TorBrowserDriver(*self.args, **self.kwargs)
        try:
            yield self.driver
        finally:
            # BUG FIX: quit even when the with-body raises, so the
            # browser process is not leaked on error.
            self.driver.quit()
class TorBrowserWrapper(object):
    """Wraps the TorBrowserDriver to configure it at the constructor
    and run it with the `launch` method.

    We might consider to change the TorBrowserDriver itself to follow
    torcontroller and stem behaviour: init configures and a method is
    used to launch driver/controller, and this method is the one used
    to implement the contextmanager.
    """

    def __init__(self, *args, **kwargs):
        # Saved and forwarded unchanged to TorBrowserDriver in `launch`.
        self.args = args
        self.kwargs = kwargs
        self.driver = None

    def __getattr__(self, item):
        # Called only when normal attribute lookup fails, so `launch`
        # itself never triggers this. With no driver running there is
        # nothing to delegate to; return None (callers check for it).
        if self.driver is None:
            return None
        return getattr(self.driver, item)

    @contextmanager
    def launch(self):
        """Start the wrapped driver, yield it, and always quit it."""
        self.driver = TorBrowserDriver(*self.args, **self.kwargs)
        try:
            yield self.driver
        finally:
            # BUG FIX: the original skipped quit() when the with-body
            # raised, leaking the browser process.
            self.driver.quit()
def test_close_all_streams(self):
    """Open a page over Tor, close every stream, and verify that no
    stream survives afterwards."""
    driver = TorBrowserDriver(cm.TBB_DIR, tbb_logfile_path='test.log')
    driver.get('http://www.google.com')
    time.sleep(30)
    self.tor_controller.close_all_streams()
    leftover = []
    for stream in self.tor_controller.controller.get_streams():
        print(stream.id, stream.purpose, stream.target_address, "open!")
        leftover.append(stream)
    driver.quit()
    self.assertFalse(leftover, 'Could not close all streams.')
def test_close_all_streams(self):
    """Open a page over Tor, close every stream, and verify that no
    stream survives afterwards."""
    streams_open = False
    new_tb_drv = TorBrowserDriver(cm.TBB_PATH)
    new_tb_drv.get('http://www.google.com')
    time.sleep(30)
    self.tor_controller.close_all_streams()
    for stream in self.tor_controller.controller.get_streams():
        # BUG FIX: Python 2 print statement converted to the print()
        # function, consistent with the Python 3 code in this file.
        print(stream.id, stream.purpose, stream.target_address, "open!")
        streams_open = True
    new_tb_drv.quit()
    self.assertFalse(streams_open, 'Could not close all streams.')
class RunDriverWithControllerTest(unittest.TestCase):
    """Demonstrates how to run tor with TorController while browsing
    with TorBrowserDriver."""

    @unittest.skip("Only for didactic purposes.")
    def test_run_driver_with_controller(self):
        # Launch the controller listening on a custom SOCKS port.
        socks_port = 6666
        self.tor_controller = TorController(
            cm.TBB_PATH, torrc_dict={'SocksPort': str(socks_port)})
        self.tor_process = self.tor_controller.launch_tor_service()
        # Point the browser at that port and fetch a page.
        self.tor_driver = TorBrowserDriver(cm.TBB_PATH,
                                           socks_port=socks_port)
        self.tor_driver.get("http://google.com")
        # Tear everything down.
        self.tor_driver.quit()
        self.tor_controller.kill_tor_proc()
class RunDriverWithControllerTest(unittest.TestCase):
    """Demonstrates how to run tor with TorController while browsing
    with TorBrowserDriver."""

    @unittest.skip("Only for didactic purposes.")
    def test_run_driver_with_controller(self):
        # Launch the controller listening on a custom SOCKS port.
        socks_port = 6666
        self.tor_controller = TorController(
            cm.TBB_DIR, torrc_dict={'SocksPort': str(socks_port)})
        self.tor_process = self.tor_controller.launch_tor_service()
        # Point the browser at that port and fetch a page.
        self.tor_driver = TorBrowserDriver(cm.TBB_DIR,
                                           socks_port=socks_port)
        self.tor_driver.get("http://google.com")
        # Tear everything down.
        self.tor_driver.quit()
        self.tor_controller.quit()
class Crawler:
    """Crawls your onions, but also manages Tor, drives Tor Browser, and
    uses information from your Tor cell log and stem to collect cell
    sequences."""

    def __init__(self,
                 take_ownership=True,  # Tor dies when the Crawler does
                 torrc_config=None,
                 tor_log="/var/log/tor/tor.log",
                 tor_cell_log="/var/log/tor/tor_cell_seq.log",
                 control_port=9051,
                 socks_port=9050,
                 run_in_xvfb=True,
                 tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
                 tb_log_path=join(_log_dir, "firefox.log"),
                 tb_tor_cfg=USE_RUNNING_TOR,
                 page_load_timeout=20,
                 wait_on_page=5,
                 wait_after_closing_circuits=0,
                 restart_on_sketchy_exception=True,
                 additional_control_fields=None,
                 db_handler=None):
        # BUG FIX: the dict defaults ({"CookieAuth": "1"} and {}) were
        # mutable default arguments shared across instances and mutated
        # below via .update(); use None sentinels instead.
        if torrc_config is None:
            torrc_config = {"CookieAuth": "1"}
        if additional_control_fields is None:
            additional_control_fields = {}
        self.logger = setup_logging(_log_dir, "crawler")
        # Set stem logging level to INFO - "high level library activity"
        stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)
        self.torrc_config = torrc_config
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(
            config=self.torrc_config,
            take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()
        self.logger.info("Opening cell log stream...")
        self.cell_log = open(tor_cell_log, "rb")
        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()
        self.logger.info("Starting Tor Browser...")
        # BUG FIX: remember the browser paths so restart_tb() can rebuild
        # the driver (it previously referenced undefined names).
        self.tbb_path = tbb_path
        self.tb_log_path = tb_log_path
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception
        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)

    def authenticate_to_tor_controlport(self):
        """Connect a stem Controller to the control port and authenticate."""
        self.logger.info("Authenticating to the tor controlport...")
        try:
            self.controller = Controller.from_port(port=self.control_port)
        except stem.SocketError as exc:
            panic("Unable to connect to tor on port {self.control_port}: "
                  "{exc}".format(**locals()))
        try:
            self.controller.authenticate()
        except stem.connection.MissingPassword:
            panic("Unable to authenticate to tor controlport. Please add "
                  "`CookieAuth 1` to your tor configuration file.")

    def get_control_data(self, page_load_timeout, wait_on_page,
                         wait_after_closing_circuits,
                         additional_control_fields):
        """Gather metadata about the crawler instance."""
        control_data = {}
        # Configuration settings
        control_data["page_load_timeout"] = page_load_timeout
        control_data["wait_on_page"] = wait_on_page
        control_data["wait_after_closing_circuits"] = \
            wait_after_closing_circuits
        if additional_control_fields:
            control_data.update(additional_control_fields)
        # System facts
        control_data["kernel"] = platform.system()
        control_data["kernel_version"] = platform.release()
        control_data["os"] = platform.version()
        control_data["python_version"] = platform.python_version()
        ip = urlopen("https://api.ipify.org").read().decode()
        control_data["ip"] = ip
        # This API seems to be unstable and we haven't found a suitable
        # alternative :(
        try:
            asn_geoip = urlopen("http://api.moocher.io/ip/{}".format(ip))
            asn_geoip = literal_eval(asn_geoip.read().decode())
            control_data["asn"] = asn_geoip.get("ip").get("as").get("asn")
            control_data["city"] = asn_geoip.get("ip").get("city")
            control_data["country"] = asn_geoip.get("ip").get("country")
        except urllib.error.HTTPError:
            self.logger.warning("Unable to query ASN API and thus some "
                                "control data may be missing from this run.")
        control_data["tor_version"] = self.controller.get_version().version_str
        control_data["tb_version"] = self.tb_driver.tb_version
        # Tor will have multiple entry nodes in its state file, but will
        # choose the first sequential one that is up as its entry guard.
        entry_nodes = self.controller.get_info("entry-guards").split('\n')
        control_data["entry_node"] = next(
            re.search("[0-9A-F]{40}", g).group(0)
            for g in entry_nodes if re.search("up", g))
        control_data["crawler_version"] = _version
        return control_data

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # BUG FIX: returning self (truthy) from __exit__ silently
        # suppressed any exception raised inside the with-block.
        return False

    def __del__(self):
        self.close()

    def close(self):
        """Shut down the browser, framebuffer, cell log stream, and tor
        process -- whichever of them exist on this instance."""
        self.logger.info("Beginning Crawler exit process...")
        if hasattr(self, "tb_driver"):
            self.logger.info("Closing Tor Browser...")
            self.tb_driver.quit()
        if hasattr(self, "virtual_framebuffer"):
            self.logger.info("Closing the virtual framebuffer...")
            # A bug in pyvirtualdisplay triggers a KeyError exception when
            # closing a virtual framebuffer if the $DISPLAY environment
            # variable is not set.
            try:
                stop_xvfb(self.virtual_framebuffer)
            except KeyError:
                pass
        if hasattr(self, "cell_log"):
            self.logger.info("Closing the Tor cell stream...")
            self.cell_log.close()
        if hasattr(self, "tor_process"):
            self.logger.info("Killing the tor process...")
            self.tor_process.kill()
        self.logger.info("Crawler exit completed.")

    def collect_onion_trace(self, url, hsid=None, extra_fn=None,
                            trace_dir=None, iteration=0):
        """Crawl an onion service and collect a complete cell sequence
        for the activity at the time. Also, record additional information
        about the circuits with stem. Optionally, pass a function to
        execute additional actions after the page has loaded."""
        # Todo: create collect_trace method that works for regular sites
        # as well
        assert ".onion" in url, ("This method is only suitable for crawling "
                                 "onion services.")
        self.logger.info("{url}: closing existing circuits before starting "
                         "crawl.".format(**locals()))
        for circuit in self.controller.get_circuits():
            self.controller.close_circuit(circuit.id)
        sleep(self.wait_after_closing_circuits)
        if not trace_dir:
            trace_dir = self.make_ts_dir()
        trace_name = urllib.parse.quote(url, safe="") + "-" + str(iteration)
        trace_path = join(trace_dir, trace_name)
        start_idx = self.get_cell_log_pos()
        try:
            self.crawl_url(url)
            # Raises CrawlerNoRendCircError when no rendezvous circuit is
            # associated with the URL (return value not needed here).
            self.get_rend_circ_ids(url)
            if extra_fn:
                # BUG FIX: pass extra_fn through explicitly;
                # execute_extra_fn previously read an undefined global.
                self.execute_extra_fn(url, trace_path, start_idx,
                                      extra_fn=extra_fn)
        except CrawlerLoggedError:
            return "failed"
        except CrawlerNoRendCircError:
            self.save_debug_log(url, trace_path, start_idx)
            return "failed"
        # BUG FIX: a bare `except:` also trapped KeyboardInterrupt and
        # SystemExit; only unexpected *errors* should be logged here.
        except Exception:
            self.logger.exception("{url}: unusual exception "
                                  "encountered:".format(**locals()))
            # Also log active circuit info
            self.controller.get_circuits()
            exc_type = exc_info()[0]
            if exc_type in _sketchy_exceptions:
                self.save_debug_log(url, trace_path, start_idx)
                if self.restart_on_sketchy_exception:
                    self.restart_tb()
            return "failed"
        self.logger.info("{url}: saving full trace...".format(**locals()))
        end_idx = self.get_cell_log_pos()
        full_trace = self.get_full_trace(start_idx, end_idx)
        # Save the trace to the database or write to file
        if self.db_handler:
            try:
                new_example = {
                    'hsid': hsid,
                    'crawlid': self.crawlid,
                    't_scrape': get_timestamp("db")
                }
            # BUG FIX: a missing self.crawlid raises AttributeError, which
            # the original `except NameError` could never catch.
            except (NameError, AttributeError):
                panic("If using the database, and calling collect_onion_trace "
                      "directly, you must specify the hsid of the site.")
            exampleid = self.db_handler.add_example(new_example)
            self.db_handler.add_trace(str(full_trace), exampleid)
        else:
            with open(trace_path + "-full", "wb") as fh:
                fh.write(full_trace)
        return "succeeded"

    def make_ts_dir(self, parent_dir=_log_dir, raw_dir_name="batch"):
        """Creates a timestamped folder to hold a group of traces."""
        raw_dirpath = join(parent_dir, raw_dir_name)
        ts = get_timestamp("log")
        ts_dir = timestamp_file(raw_dirpath, ts, is_dir=True)
        symlink_cur_to_latest(raw_dirpath, ts)
        # Persist the crawl's control metadata alongside its traces.
        with open(join(ts_dir, "control.pickle"), "wb") as fh:
            pickle.dump(self.control_data, fh)
        return ts_dir

    def get_cell_log_pos(self):
        """Returns the current position of the last byte in the Tor cell
        log."""
        return self.cell_log.seek(0, SEEK_END)

    def crawl_url(self, url):
        """Load a web page in Tor Browser, raising CrawlerLoggedError on
        any failure that has already been logged."""
        self.logger.info("{url}: starting page load...".format(**locals()))
        try:
            self.tb_driver.load_url(url, wait_on_page=self.wait_on_page,
                                    wait_for_page_body=True)
        except TimeoutException:
            self.logger.warning("{url}: timed out.".format(**locals()))
            raise CrawlerLoggedError
        except http.client.CannotSendRequest:
            self.logger.warning("{url}: cannot send request--improper "
                                "connection state.".format(**locals()))
            raise CrawlerLoggedError
        # Make sure we haven't just hit an error page or nothing loaded
        try:
            if (self.tb_driver.is_connection_error_page
                    or self.tb_driver.current_url == "about:newtab"):
                raise CrawlerReachedErrorPage
        except CrawlerReachedErrorPage:
            self.logger.warning("{url}: reached connection error "
                                "page.".format(**locals()))
            raise CrawlerLoggedError
        self.logger.info("{url}: successfully loaded.".format(**locals()))

    def get_rend_circ_ids(self, url):
        """Returns the rendezvous circuit id(s) associated with a given
        onion service."""
        self.logger.info("{url}: collecting circuit "
                         "information...".format(**locals()))
        active_circs = self.controller.get_circuits()
        rend_circ_ids = set()
        for circ in active_circs:
            if (circ.purpose == "HS_CLIENT_REND" and circ.socks_username
                    and circ.socks_username in url):
                rend_circ_ids.add(circ.id)
        # If everything goes perfect, we should only see one. Multiple
        # indicate the first failed. Zero indicates one closed abruptly
        # (or there's an error with stem--still waiting on data to confirm
        # or deny).
        rend_circ_ct = len(rend_circ_ids)
        self.logger.info(
            "{url}: {rend_circ_ct} associated rendezvous circuits "
            "discovered.".format(**locals()))
        if rend_circ_ct == 0:
            raise CrawlerNoRendCircError
        return rend_circ_ids

    def execute_extra_fn(self, url, trace_path, start_idx, extra_fn=None):
        """Run a caller-supplied hook after the page has loaded.

        BUG FIX: extra_fn is now an explicit (backward-compatible)
        parameter; the original body referenced an undefined global
        `extra_fn`, which was a NameError whenever this method ran.
        """
        self.logger.info("{url}: executing extra function "
                         "code...".format(**locals()))
        if extra_fn is not None:
            extra_fn(self, url, trace_path, start_idx)
        self.logger.info("{url}: extra function executed "
                         "successfully.".format(**locals()))

    def save_debug_log(self, url, trace_path, start_idx):
        """Write the cell trace collected so far to a debug file."""
        self.logger.warning("{url}: saving debug log...".format(**locals()))
        exc_time = self.get_cell_log_pos()
        trace = self.get_full_trace(start_idx, exc_time)
        with open(trace_path + "@debug", "wb") as fh:
            fh.write(trace)

    def get_full_trace(self, start_idx, end_idx):
        """Returns the Tor DATA cells transmitted over a circuit during a
        specified time period."""
        # Sanity check
        assert start_idx >= 0 and end_idx > 0, ("Invalid (negative) logfile "
                                                "position")
        assert end_idx > start_idx, ("logfile section end_idx must come "
                                     "after start_idx")
        self.cell_log.seek(start_idx, SEEK_SET)
        return self.cell_log.read(end_idx - start_idx)

    def restart_tb(self):
        """Restarts the Tor Browser."""
        self.logger.info("Restarting the Tor Browser...")
        self.tb_driver.quit()
        # BUG FIX: use the paths saved in __init__; `tbb_path` and
        # `tb_log_path` were previously undefined names here (NameError).
        self.tb_driver = TorBrowserDriver(tbb_path=self.tbb_path,
                                          tor_cfg=USE_RUNNING_TOR,
                                          tbb_logfile_path=self.tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        # Re-apply the timeout, which the original restart silently lost.
        self.tb_driver.set_page_load_timeout(self.page_load_timeout)
        self.logger.info("Tor Browser restarted...")

    def collect_set_of_traces(self, url_set, extra_fn=None, trace_dir=None,
                              iteration=0, shuffle=True, retry=True,
                              url_to_id_mapping=None):
        """Collect a set of traces."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = url_set
            trace_dir = None
        elif not trace_dir:
            trace_dir = self.make_ts_dir()
        set_size = len(url_set)
        self.logger.info("Saving set of {set_size} traces to "
                         "{trace_dir}.".format(**locals()))
        # Converts both sets (from pickle files) and dicts (whose keys are
        # URLs--from database) to URL lists
        url_set = list(url_set)
        if shuffle:
            random.shuffle(url_set)
        failed_urls = []
        for url_idx in range(set_size):
            self.logger.info("Collecting trace {} of "
                             "{set_size}...".format(url_idx + 1, **locals()))
            url = url_set[url_idx]
            if self.db_handler:
                hsid = url_to_id_mapping[url]
            else:
                hsid = None
            if (self.collect_onion_trace(url, hsid=hsid, extra_fn=extra_fn,
                                         trace_dir=trace_dir,
                                         iteration=iteration) == "failed"
                    and retry):
                failed_urls.append(url)
        if failed_urls:
            failed_ct = len(failed_urls)
            self.logger.info("Retrying {failed_ct} of {set_size} traces that "
                             "failed.".format(**locals()))
            # retry=False so a URL is retried at most once.
            self.collect_set_of_traces(failed_urls, extra_fn=extra_fn,
                                       trace_dir=trace_dir,
                                       iteration=iteration,
                                       shuffle=shuffle, retry=False,
                                       url_to_id_mapping=url_to_id_mapping)

    def crawl_monitored_nonmonitored(self, monitored_class,
                                     nonmonitored_class, extra_fn=None,
                                     shuffle=True, retry=True,
                                     monitored_name="monitored",
                                     nonmonitored_name="nonmonitored",
                                     url_to_id_mapping=None, ratio=40):
        """Crawl a monitored class ratio times interspersed between the
        crawling of a(n ostensibly larger) non-monitored class."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = nonmonitored_class
            url_to_id_mapping.update(monitored_class)
            trace_dir, mon_trace_dir, nonmon_trace_dir = (None,) * 3
        else:
            trace_dir = self.make_ts_dir()
            mon_trace_dir = join(trace_dir, monitored_name)
            mkdir(mon_trace_dir)
            nonmon_trace_dir = join(trace_dir, nonmonitored_name)
            mkdir(nonmon_trace_dir)
        # db: calling list on a dict returns a list of its keys (URLs)
        # pickle: calling list on set is necessary to make it shuffleable
        nonmonitored_class = list(nonmonitored_class)
        monitored_class = list(monitored_class)
        nonmonitored_class_ct = len(nonmonitored_class)
        # Integer division (was int(x / y)); identical for these counts
        # but avoids float rounding.
        chunk_size = nonmonitored_class_ct // ratio
        if shuffle:
            random.shuffle(nonmonitored_class)
            random.shuffle(monitored_class)
        for iteration in range(ratio):
            self.logger.info("Beginning iteration {i} of {ratio} in the "
                             "{monitored_name} class".format(i=iteration + 1,
                                                             **locals()))
            self.collect_set_of_traces(monitored_class,
                                       trace_dir=mon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
            slice_lb = iteration * chunk_size
            slice_ub = min((iteration + 1) * chunk_size,
                           nonmonitored_class_ct)
            self.logger.info("Crawling services {} through {slice_ub} of "
                             "{nonmonitored_class_ct} in the "
                             "{nonmonitored_name} "
                             "class".format(slice_lb + 1, **locals()))
            self.collect_set_of_traces(nonmonitored_class[slice_lb:slice_ub],
                                       trace_dir=nonmon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
class Crawler:
    """Crawls your onions, but also manages Tor, drives Tor Browser, and
    uses information from your Tor cell log and stem to collect cell
    sequences."""

    def __init__(self,
                 take_ownership=True,  # Tor dies when the Crawler does
                 torrc_config=None,
                 tor_log="/var/log/tor/tor.log",
                 tor_cell_log="/var/log/tor/tor_cell_seq.log",
                 control_port=9051,
                 socks_port=9050,
                 run_in_xvfb=True,
                 tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
                 tb_log_path=join(_log_dir, "firefox.log"),
                 tb_tor_cfg=USE_RUNNING_TOR,
                 page_load_timeout=20,
                 wait_on_page=5,
                 wait_after_closing_circuits=0,
                 restart_on_sketchy_exception=True,
                 additional_control_fields=None,
                 db_handler=None):
        # BUG FIX: the dict defaults ({"CookieAuth": "1"} and {}) were
        # mutable default arguments shared across instances and mutated
        # below via .update(); use None sentinels instead.
        if torrc_config is None:
            torrc_config = {"CookieAuth": "1"}
        if additional_control_fields is None:
            additional_control_fields = {}
        self.logger = setup_logging(_log_dir, "crawler")
        # Set stem logging level to INFO - "high level library activity"
        stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)
        self.torrc_config = torrc_config
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(
            config=self.torrc_config,
            take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()
        self.logger.info("Opening cell log stream...")
        self.cell_log = open(tor_cell_log, "rb")
        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.run_in_xvfb = True
            self.virtual_framebuffer = start_xvfb()
        self.logger.info("Starting Tor Browser...")
        # BUG FIX: remember the browser paths so restart_tb() can rebuild
        # the driver (it previously referenced undefined names).
        self.tbb_path = tbb_path
        self.tb_log_path = tb_log_path
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception
        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)

    def authenticate_to_tor_controlport(self):
        """Connect a stem Controller to the control port and authenticate."""
        self.logger.info("Authenticating to the tor controlport...")
        try:
            self.controller = Controller.from_port(port=self.control_port)
        except stem.SocketError as exc:
            panic("Unable to connect to tor on port {self.control_port}: "
                  "{exc}".format(**locals()))
        try:
            self.controller.authenticate()
        except stem.connection.MissingPassword:
            panic("Unable to authenticate to tor controlport. Please add "
                  "`CookieAuth 1` to your tor configuration file.")

    def get_control_data(self, page_load_timeout, wait_on_page,
                         wait_after_closing_circuits,
                         additional_control_fields):
        """Gather metadata about the crawler instance."""
        control_data = {}
        # Configuration settings
        control_data["page_load_timeout"] = page_load_timeout
        control_data["wait_on_page"] = wait_on_page
        control_data["wait_after_closing_circuits"] = \
            wait_after_closing_circuits
        if additional_control_fields:
            control_data.update(additional_control_fields)
        # System facts
        control_data["kernel"] = platform.system()
        control_data["kernel_version"] = platform.release()
        control_data["os"] = platform.version()
        control_data["python_version"] = platform.python_version()
        ip = urlopen("https://api.ipify.org").read().decode()
        control_data["ip"] = ip
        # This API seems to be unstable and we haven't found a suitable
        # alternative :(
        try:
            asn_geoip = urlopen("http://api.moocher.io/ip/{}".format(ip))
            asn_geoip = literal_eval(asn_geoip.read().decode())
            control_data["asn"] = asn_geoip.get("ip").get("as").get("asn")
            control_data["city"] = asn_geoip.get("ip").get("city")
            control_data["country"] = asn_geoip.get("ip").get("country")
        except urllib.error.HTTPError:
            self.logger.warning("Unable to query ASN API and thus some "
                                "control data may be missing from this run.")
        control_data["tor_version"] = self.controller.get_version().version_str
        control_data["tb_version"] = self.tb_driver.tb_version
        # Tor will have multiple entry nodes in its state file, but will
        # choose the first sequential one that is up as its entry guard.
        entry_nodes = self.controller.get_info("entry-guards").split('\n')
        control_data["entry_node"] = next(
            re.search("[0-9A-F]{40}", g).group(0)
            for g in entry_nodes if re.search("up", g))
        control_data["crawler_version"] = _version
        return control_data

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # BUG FIX: returning self (truthy) from __exit__ silently
        # suppressed any exception raised inside the with-block.
        return False

    def __del__(self):
        self.close()

    def close(self):
        """Shut down the browser, framebuffer, cell log stream, and tor
        process -- whichever of them exist on this instance."""
        self.logger.info("Beginning Crawler exit process...")
        if hasattr(self, "tb_driver"):
            self.logger.info("Closing Tor Browser...")
            self.tb_driver.quit()
        if hasattr(self, "virtual_framebuffer"):
            self.logger.info("Closing the virtual framebuffer...")
            # A bug in pyvirtualdisplay triggers a KeyError exception when
            # closing a virtual framebuffer if the $DISPLAY environment
            # variable is not set.
            try:
                stop_xvfb(self.virtual_framebuffer)
            except KeyError:
                pass
        if hasattr(self, "cell_log"):
            self.logger.info("Closing the Tor cell stream...")
            self.cell_log.close()
        if hasattr(self, "tor_process"):
            self.logger.info("Killing the tor process...")
            self.tor_process.kill()
        self.logger.info("Crawler exit completed.")

    def collect_onion_trace(self, url, hsid=None, extra_fn=None,
                            trace_dir=None, iteration=0):
        """Crawl an onion service and collect a complete cell sequence
        for the activity at the time. Also, record additional information
        about the circuits with stem. Optionally, pass a function to
        execute additional actions after the page has loaded."""
        # Todo: create collect_trace method that works for regular sites
        # as well
        assert ".onion" in url, ("This method is only suitable for crawling "
                                 "onion services.")
        self.logger.info("{url}: closing existing circuits before starting "
                         "crawl.".format(**locals()))
        for circuit in self.controller.get_circuits():
            self.controller.close_circuit(circuit.id)
        sleep(self.wait_after_closing_circuits)
        if not trace_dir:
            trace_dir = self.make_ts_dir()
        trace_name = urllib.parse.quote(url, safe="") + "-" + str(iteration)
        trace_path = join(trace_dir, trace_name)
        start_idx = self.get_cell_log_pos()
        try:
            self.crawl_url(url)
            # Raises CrawlerNoRendCircError when no rendezvous circuit is
            # associated with the URL (return value not needed here).
            self.get_rend_circ_ids(url)
            if extra_fn:
                # BUG FIX: pass extra_fn through explicitly;
                # execute_extra_fn previously read an undefined global.
                self.execute_extra_fn(url, trace_path, start_idx,
                                      extra_fn=extra_fn)
        except CrawlerLoggedError:
            return "failed"
        except CrawlerNoRendCircError:
            self.save_debug_log(url, trace_path, start_idx)
            return "failed"
        # BUG FIX: a bare `except:` also trapped KeyboardInterrupt and
        # SystemExit; only unexpected *errors* should be logged here.
        except Exception:
            self.logger.exception("{url}: unusual exception "
                                  "encountered:".format(**locals()))
            # Also log active circuit info
            self.controller.get_circuits()
            exc_type = exc_info()[0]
            if exc_type in _sketchy_exceptions:
                self.save_debug_log(url, trace_path, start_idx)
                if self.restart_on_sketchy_exception:
                    self.restart_tb()
            return "failed"
        self.logger.info("{url}: saving full trace...".format(**locals()))
        end_idx = self.get_cell_log_pos()
        full_trace = self.get_full_trace(start_idx, end_idx)
        # Save the trace to the database or write to file
        if self.db_handler:
            try:
                new_example = {'hsid': hsid,
                               'crawlid': self.crawlid,
                               't_scrape': get_timestamp("db")}
            # BUG FIX: a missing self.crawlid raises AttributeError, which
            # the original `except NameError` could never catch.
            except (NameError, AttributeError):
                panic("If using the database, and calling collect_onion_trace "
                      "directly, you must specify the hsid of the site.")
            exampleid = self.db_handler.add_example(new_example)
            self.db_handler.add_trace(str(full_trace), exampleid)
        else:
            with open(trace_path + "-full", "wb") as fh:
                fh.write(full_trace)
        return "succeeded"

    def make_ts_dir(self, parent_dir=_log_dir, raw_dir_name="batch"):
        """Creates a timestamped folder to hold a group of traces."""
        raw_dirpath = join(parent_dir, raw_dir_name)
        ts = get_timestamp("log")
        ts_dir = timestamp_file(raw_dirpath, ts, is_dir=True)
        symlink_cur_to_latest(raw_dirpath, ts)
        # Persist the crawl's control metadata alongside its traces.
        with open(join(ts_dir, "control.pickle"), "wb") as fh:
            pickle.dump(self.control_data, fh)
        return ts_dir

    def get_cell_log_pos(self):
        """Returns the current position of the last byte in the Tor cell
        log."""
        return self.cell_log.seek(0, SEEK_END)

    def crawl_url(self, url):
        """Load a web page in Tor Browser, raising CrawlerLoggedError on
        any failure that has already been logged."""
        self.logger.info("{url}: starting page load...".format(**locals()))
        try:
            self.tb_driver.load_url(url, wait_on_page=self.wait_on_page,
                                    wait_for_page_body=True)
        except TimeoutException:
            self.logger.warning("{url}: timed out.".format(**locals()))
            raise CrawlerLoggedError
        except http.client.CannotSendRequest:
            self.logger.warning("{url}: cannot send request--improper "
                                "connection state.".format(**locals()))
            raise CrawlerLoggedError
        # Make sure we haven't just hit an error page or nothing loaded
        try:
            if (self.tb_driver.is_connection_error_page
                    or self.tb_driver.current_url == "about:newtab"):
                raise CrawlerReachedErrorPage
        except CrawlerReachedErrorPage:
            self.logger.warning("{url}: reached connection error "
                                "page.".format(**locals()))
            raise CrawlerLoggedError
        self.logger.info("{url}: successfully loaded.".format(**locals()))

    def get_rend_circ_ids(self, url):
        """Returns the rendezvous circuit id(s) associated with a given
        onion service."""
        self.logger.info("{url}: collecting circuit "
                         "information...".format(**locals()))
        active_circs = self.controller.get_circuits()
        rend_circ_ids = set()
        for circ in active_circs:
            if (circ.purpose == "HS_CLIENT_REND" and circ.socks_username
                    and circ.socks_username in url):
                rend_circ_ids.add(circ.id)
        # If everything goes perfect, we should only see one. Multiple
        # indicate the first failed. Zero indicates one closed abruptly
        # (or there's an error with stem--still waiting on data to confirm
        # or deny).
        rend_circ_ct = len(rend_circ_ids)
        self.logger.info("{url}: {rend_circ_ct} associated rendezvous circuits "
                         "discovered.".format(**locals()))
        if rend_circ_ct == 0:
            raise CrawlerNoRendCircError
        return rend_circ_ids

    def execute_extra_fn(self, url, trace_path, start_idx, extra_fn=None):
        """Run a caller-supplied hook after the page has loaded.

        BUG FIX: extra_fn is now an explicit (backward-compatible)
        parameter; the original body referenced an undefined global
        `extra_fn`, which was a NameError whenever this method ran.
        """
        self.logger.info("{url}: executing extra function "
                         "code...".format(**locals()))
        if extra_fn is not None:
            extra_fn(self, url, trace_path, start_idx)
        self.logger.info("{url}: extra function executed "
                         "successfully.".format(**locals()))

    def save_debug_log(self, url, trace_path, start_idx):
        """Write the cell trace collected so far to a debug file."""
        self.logger.warning("{url}: saving debug log...".format(**locals()))
        exc_time = self.get_cell_log_pos()
        trace = self.get_full_trace(start_idx, exc_time)
        with open(trace_path + "@debug", "wb") as fh:
            fh.write(trace)

    def get_full_trace(self, start_idx, end_idx):
        """Returns the Tor DATA cells transmitted over a circuit during a
        specified time period."""
        # Sanity check
        assert start_idx >= 0 and end_idx > 0, ("Invalid (negative) logfile "
                                                "position")
        assert end_idx > start_idx, ("logfile section end_idx must come "
                                     "after start_idx")
        self.cell_log.seek(start_idx, SEEK_SET)
        return self.cell_log.read(end_idx - start_idx)

    def restart_tb(self):
        """Restarts the Tor Browser."""
        self.logger.info("Restarting the Tor Browser...")
        self.tb_driver.quit()
        # BUG FIX: use the paths saved in __init__; `tbb_path` and
        # `tb_log_path` were previously undefined names here (NameError).
        self.tb_driver = TorBrowserDriver(tbb_path=self.tbb_path,
                                          tor_cfg=USE_RUNNING_TOR,
                                          tbb_logfile_path=self.tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        # Re-apply the timeout, which the original restart silently lost.
        self.tb_driver.set_page_load_timeout(self.page_load_timeout)
        self.logger.info("Tor Browser restarted...")

    def collect_set_of_traces(self, url_set, extra_fn=None, trace_dir=None,
                              iteration=0, shuffle=True, retry=True,
                              url_to_id_mapping=None):
        """Collect a set of traces."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = url_set
            trace_dir = None
        elif not trace_dir:
            trace_dir = self.make_ts_dir()
        set_size = len(url_set)
        self.logger.info("Saving set of {set_size} traces to "
                         "{trace_dir}.".format(**locals()))
        # Converts both sets (from pickle files) and dicts (whose keys are
        # URLs--from database) to URL lists
        url_set = list(url_set)
        if shuffle:
            random.shuffle(url_set)
        failed_urls = []
        for url_idx in range(set_size):
            self.logger.info("Collecting trace {} of "
                             "{set_size}...".format(url_idx + 1, **locals()))
            url = url_set[url_idx]
            if self.db_handler:
                hsid = url_to_id_mapping[url]
            else:
                hsid = None
            if (self.collect_onion_trace(url, hsid=hsid, extra_fn=extra_fn,
                                         trace_dir=trace_dir,
                                         iteration=iteration) == "failed"
                    and retry):
                failed_urls.append(url)
        if failed_urls:
            failed_ct = len(failed_urls)
            self.logger.info("Retrying {failed_ct} of {set_size} traces that "
                             "failed.".format(**locals()))
            # retry=False so a URL is retried at most once.
            self.collect_set_of_traces(failed_urls, extra_fn=extra_fn,
                                       trace_dir=trace_dir,
                                       iteration=iteration,
                                       shuffle=shuffle, retry=False,
                                       url_to_id_mapping=url_to_id_mapping)

    def crawl_monitored_nonmonitored(self, monitored_class,
                                     nonmonitored_class, extra_fn=None,
                                     shuffle=True, retry=True,
                                     monitored_name="monitored",
                                     nonmonitored_name="nonmonitored",
                                     url_to_id_mapping=None, ratio=40):
        """Crawl a monitored class ratio times interspersed between the
        crawling of a(n ostensibly larger) non-monitored class."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = nonmonitored_class
            url_to_id_mapping.update(monitored_class)
            trace_dir, mon_trace_dir, nonmon_trace_dir = (None,) * 3
        else:
            trace_dir = self.make_ts_dir()
            mon_trace_dir = join(trace_dir, monitored_name)
            mkdir(mon_trace_dir)
            nonmon_trace_dir = join(trace_dir, nonmonitored_name)
            mkdir(nonmon_trace_dir)
        # db: calling list on a dict returns a list of its keys (URLs)
        # pickle: calling list on set is necessary to make it shuffleable
        nonmonitored_class = list(nonmonitored_class)
        monitored_class = list(monitored_class)
        nonmonitored_class_ct = len(nonmonitored_class)
        # Integer division (was int(x / y)); identical for these counts
        # but avoids float rounding.
        chunk_size = nonmonitored_class_ct // ratio
        if shuffle:
            random.shuffle(nonmonitored_class)
            random.shuffle(monitored_class)
        for iteration in range(ratio):
            self.logger.info("Beginning iteration {i} of {ratio} in the "
                             "{monitored_name} class".format(i=iteration + 1,
                                                             **locals()))
            self.collect_set_of_traces(monitored_class,
                                       trace_dir=mon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
            slice_lb = iteration * chunk_size
            slice_ub = min((iteration + 1) * chunk_size,
                           nonmonitored_class_ct)
            self.logger.info("Crawling services {} through {slice_ub} of "
                             "{nonmonitored_class_ct} in the "
                             "{nonmonitored_name} "
                             "class".format(slice_lb + 1, **locals()))
            self.collect_set_of_traces(nonmonitored_class[slice_lb:slice_ub],
                                       trace_dir=nonmon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
#!/usr/bin/env python3
"""Load a URL (argv[1], defaulting to about:blank) in Tor Browser and
exit with status 0 on success or 1 on failure."""
from tbselenium.tbdriver import TorBrowserDriver
import time
from sys import argv, exit

tbpath = "tor-browser_en-US"
website = argv[1] if len(argv) > 1 else "about:blank"

driver = TorBrowserDriver(tbpath)
driver.set_page_load_timeout(90)
try:
    driver.load_url(website)
except Exception as err:
    print(err)
    driver.quit()
    exit(1)
time.sleep(1)
driver.quit()
exit(0)
def tor_web_crawler(index, link, ip_address):
    """
    This function is a web crawler for collection of traffic traces and
    saving those traces to pcap files.

    Drives Tor Browser to ``link`` while tcpdump records the traffic of
    ``ip_address``; the capture goes to a per-domain pcap file and a
    screenshot of the loaded page is saved.

    :param index: current trace of the link
    :param link: webpage address from where traffic is to be collected
    :param ip_address: ip-addres of the machine from which traffic is to be
                       collected
    :return: None
    """
    # Extracting domain name for saving trace separately
    url = link
    lnk = tldextract.extract(url)
    domain_name = lnk.domain + '.' + lnk.suffix
    # print('Processing trace for domain name crawl : ', domain_name)

    # interface = 'enp0s31f6'
    # interface = 'any'
    interface = 'eth0'

    # Per-domain output directories; computed before the browser is started
    # so they are always bound, even if the browser crashes.
    PP = PACP_PATH + '/' + domain_name   # saving the pcapfiles
    SS = SCREEN_SHOT + '/' + domain_name  # saving the screen shots

    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = True  # optional

    try:
        driver = TorBrowserDriver(TBB_PATH)
        driver.get(url)
    except wde as e:
        # Browser occasionally crashes on startup or load: actually retry
        # once (the old code re-bound `driver` to itself, which retried
        # nothing). Any other exception propagates with its traceback.
        print('Browser crashed:')
        print(e)
        print('Trying again in 10 seconds ...')
        time.sleep(10)
        driver = TorBrowserDriver(TBB_PATH)
        driver.get(url)
        print('Success!\n')

    if not os.path.isdir(PP):
        print('Creating directory for saving capture files (pcap) ...')
        os.makedirs(PP)
    if not os.path.isdir(SS):
        print('Creating directory for saving screenshots ...')
        os.makedirs(SS)

    # Capture at most 60 seconds of traffic to/from ip_address. Build the
    # command as an argv list (shell=False) so URL-derived strings can never
    # be interpreted as shell syntax.
    pcap_file = PP + "/" + domain_name + "_" + str(index) + ".pcap"
    command = ["sudo", "timeout", "60", "tcpdump",
               "-i", str(interface),
               "-n", "host", str(ip_address),
               "-w", pcap_file]
    print('Capture trace ...')
    capture = subprocess.Popen(command)
    # time.sleep(1)
    capture.wait()
    print('Traffic trace captured and saved successfully.')

    # save the screenshot
    driver.save_screenshot(SS + '/' + domain_name + '-' + str(index) + '.png')
    print('Screen shot of the webpage saved successfully.')
    driver.quit()
class InstagramScraper():
    """Instagram web scraper driving a Selenium-controlled browser.

    The concrete backend (Chrome, Chrome with remote debugging, Tor
    Browser, or plain Firefox) is selected by the ``browser_type``
    constructor argument, expected to be a ``WebBrowserType`` value.
    """

    def __init__(self, browser_type, user_data_dir=None):
        # internal flag so we know what sort of web browser we are instantiating
        self.WebBrowserType = browser_type

        # various browser initiation according to different browser types
        if (browser_type == WebBrowserType.CHROME):
            print_dbg_msg_L1("\t[+] Starting Chrome...")
            options = webdriver.chrome.options.Options()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            options.add_argument("--no-default-browser-check")
            self.browser = webdriver.Chrome(chrome_options=options)
        elif (browser_type == WebBrowserType.CHROME_DEBUG):
            print_dbg_msg_L1("\t[+] Starting Chrome in debug mode...")
            options = webdriver.chrome.options.Options()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            options.add_argument("--no-default-browser-check")
            options.add_argument("--remote-debugging-port=9222")
            if user_data_dir is None:
                # Give every debug session its own throwaway profile dir.
                user_data_dir = chrome_debug_profile + "/" + str(secrets.token_hex(16))
                #print_dbg_msg_L1("\t\t[+] User data dir: " + user_data_dir)
            if not os.path.exists(user_data_dir):
                os.makedirs(user_data_dir)
            options.add_argument("--user-data-dir=" + user_data_dir)
            self.browser = webdriver.Chrome(chrome_options=options)
        elif (browser_type == WebBrowserType.TOR):
            '''
            Sometimes the Tor process fails to launch or the web browser
            fails to instantiate properly. Regardless, loop until both the
            Tor process and the browser is instantiated correctly. So far,
            over 30,000 runs, the instantiation usually kicks in after at
            most 1 failure.
            '''
            while True:
                try:
                    self.tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir)
                    self.browser = TorBrowserDriver(tbb_dir,
                                                    tor_cfg=cm.USE_STEM,
                                                    tbb_profile_path=tbb_ff_default_dir,
                                                    tbb_logfile_path=tbb_log_dir)
                except Exception as e:
                    print_dbg_msg_L1("\t[+] " + str(e))
                    print_dbg_msg_L1("\t[+] Error instantiating browser, retrying...")
                    time.sleep(1)
                    continue
                else:
                    break
        else:
            self.browser = webdriver.Firefox()

    def get(self, targetWebAddress):
        """Navigate the underlying browser to the given address."""
        self.browser.get(targetWebAddress)

    def close(self):
        """Quit the browser and, for Tor, kill the companion Tor process."""
        self.browser.quit()
        if self.WebBrowserType == WebBrowserType.TOR:
            self.tor_process.kill()

    def __enter__(self):
        # Complement to the pre-existing __exit__ so the scraper actually
        # works as a context manager.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.browser.quit()
class Visit(object):
    """Hold info about a particular visit to a page."""

    def __init__(self, batch_num, site_num, instance_num, page_url, base_dir,
                 tor_controller, bg_site=None,
                 experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False,
                 capture_screen=True):
        self.batch_num = batch_num
        self.site_num = site_num
        self.instance_num = instance_num
        self.page_url = page_url
        self.bg_site = bg_site
        self.experiment = experiment
        self.base_dir = base_dir
        self.visit_dir = None
        self.visit_log_dir = None
        self.tbb_version = cm.RECOMMENDED_TBB_VERSION
        self.capture_screen = capture_screen
        self.tor_controller = tor_controller
        self.xvfb = xvfb
        self.init_visit_dir()
        self.pcap_path = os.path.join(
            self.visit_dir, "{}.pcap".format(self.get_instance_name()))

        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Starting XVFBm %sX%s" % (cm.XVFB_W, cm.XVFB_H))
            self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
            self.vdisplay.start()

        # Create new instance of TorBrowser driver
        TorBrowserDriver.add_exception(self.page_url)
        self.tb_driver = TorBrowserDriver(tbb_path=cm.TBB_PATH,
                                          tbb_logfile_path=join(
                                              self.visit_dir, "logs",
                                              "firefox.log"))
        self.sniffer = Sniffer()  # sniffer to capture the network traffic

    def init_visit_dir(self):
        """Create results and logs directories for this visit."""
        visit_name = str(self.instance_num)
        self.visit_dir = os.path.join(self.base_dir, visit_name)
        ut.create_dir(self.visit_dir)
        self.visit_log_dir = os.path.join(self.visit_dir, 'logs')
        ut.create_dir(self.visit_log_dir)

    def get_instance_name(self):
        """Construct and return a filename for the instance."""
        inst_file_name = '{}_{}_{}' \
            .format(self.batch_num, self.site_num, self.instance_num)
        return inst_file_name

    def filter_guards_from_pcap(self):
        """Rewrite the pcap, keeping only traffic to/from guard IPs.

        Non-IP packets are kept. On failure the original capture is left on
        disk next to the filtered one for inspection.
        """
        guard_ips = set(self.tor_controller.get_all_guard_ips())
        wl_log.debug("Found %s guards in the concensus.", len(guard_ips))
        orig_pcap = self.pcap_path + ".original"
        copyfile(self.pcap_path, orig_pcap)
        try:
            preader = PcapReader(orig_pcap)
            pcap_filtered = []
            for p in preader:
                if IP not in p:
                    pcap_filtered.append(p)
                    continue
                ip = p.payload
                if ip.dst in guard_ips or ip.src in guard_ips:
                    pcap_filtered.append(p)
            wrpcap(self.pcap_path, pcap_filtered)
        except Exception as e:
            wl_log.error("ERROR: filtering pcap file: %s. Check old pcap: %s",
                         e, orig_pcap)
        else:
            os.remove(orig_pcap)

    def post_crawl(self):
        pass
        # TODO: add some sanity checks?

    def cleanup_visit(self):
        """Kill sniffer and Tor browser if they're running."""
        wl_log.info("Cleaning up visit.")
        wl_log.info("Cancelling timeout")
        ut.cancel_timeout()

        if self.sniffer and self.sniffer.is_recording:
            wl_log.info("Stopping sniffer...")
            self.sniffer.stop_capture()

        # remove non-tor traffic
        self.filter_guards_from_pcap()

        if self.tb_driver and self.tb_driver.is_running:
            # shutil.rmtree(self.tb_driver.prof_dir_path)
            wl_log.info("Quitting selenium driver...")
            self.tb_driver.quit()

        # close all open streams to prevent pollution
        self.tor_controller.close_all_streams()
        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Stopping display...")
            self.vdisplay.stop()

        # after closing driver and stoping sniffer, we run postcrawl
        self.post_crawl()

    def take_screenshot(self):
        # Screenshot failures must never abort the visit; narrowed from a
        # bare except so KeyboardInterrupt/SystemExit still propagate.
        try:
            out_png = os.path.join(self.visit_dir, 'screenshot.png')
            wl_log.info("Taking screenshot of %s to %s" % (self.page_url,
                                                           out_png))
            self.tb_driver.get_screenshot_as_file(out_png)
            if cm.running_in_CI:
                wl_log.debug("Screenshot data:image/png;base64,%s"
                             % self.tb_driver.get_screenshot_as_base64())
        except Exception:
            wl_log.info("Exception while taking screenshot of: %s"
                        % self.page_url)

    def get_wang_and_goldberg(self):
        """Visit the site according to Wang and Goldberg (WPES'13) settings."""
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to stop the visit
        self.sniffer.start_capture(
            self.pcap_path,
            'tcp and not host %s and not tcp port 22 and not tcp port 20'
            % LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except Exception:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {}".format(self.page_url))

        t1 = time.time()
        self.tb_driver.get(self.page_url)
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get_multitab(self):
        """Open two tab, use one to load a background site and the other to
        load the real site."""
        PAUSE_BETWEEN_TAB_OPENINGS = 0.5
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to kill running procs
        # load a blank page - a page is needed to send keys to the browser
        self.tb_driver.get(BAREBONE_HOME_PAGE)
        self.sniffer.start_capture(
            self.pcap_path,
            'tcp and not host %s and not tcp port 22 and not tcp port 20'
            % LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except Exception:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {} with {} in the background".
                    format(self.page_url, self.bg_site))

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        # now that the focus is on the address bar, load the background
        # site by "typing" it to the address bar and "pressing" ENTER (\n)
        # simulated by send_keys function
        body.send_keys('%s\n' % self.bg_site)

        # the delay between the loading of background and real sites
        time.sleep(PAUSE_BETWEEN_TAB_OPENINGS)

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab

        t1 = time.time()
        self.tb_driver.get(self.page_url)  # load the real site in the 2nd tab
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get(self):
        """Call the specific visit function depending on the experiment."""
        if self.experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG:
            self.get_wang_and_goldberg()
        elif self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
            self.get_multitab()
        else:
            raise ValueError("Cannot determine experiment type")
class FunctionalTest(object):
    """Shared harness for browser-based functional tests.

    Manages Tor Browser and Firefox web drivers, starts the source and
    journalist apps in subprocesses, and provides explicit-wait helpers
    for interacting with pages.
    """

    gpg = None
    new_totp = None
    session_expiration = 30
    secret_message = "These documents outline a major government invasion of privacy."
    timeout = 10
    poll_frequency = 0.1
    accept_languages = None

    default_driver_name = TORBROWSER
    driver = None
    firefox_driver = None
    torbrowser_driver = None

    driver_retry_count = 3
    driver_retry_interval = 5

    def _unused_port(self):
        # Bind to port 0 so the OS assigns a free port, then release it.
        s = socket.socket()
        s.bind(("127.0.0.1", 0))
        port = s.getsockname()[1]
        s.close()
        return port

    def set_tbb_securitylevel(self, level):
        """Set the Tor Browser security slider; raise on unknown levels."""
        if level not in {SECURITY_HIGH, SECURITY_MEDIUM, SECURITY_LOW}:
            raise ValueError("Invalid Tor Browser security setting: " + str(level))
        if hasattr(self, 'torbrowser_driver'):
            set_security_level(self.torbrowser_driver, level)

    def create_torbrowser_driver(self):
        logging.info("Creating TorBrowserDriver")
        # Mark the start of this run in the shared log. Use a context
        # manager so the handle is closed instead of leaked; the driver
        # writes to the same path via tbb_logfile_path.
        with open(LOGFILE_PATH, "a") as log_file:
            log_file.write("\n\n[%s] Running Functional Tests\n" % str(datetime.now()))
            log_file.flush()

        # Don't use Tor when reading from localhost, and turn off private
        # browsing. We need to turn off private browsing because we won't be
        # able to access the browser's cookies in private browsing mode. Since
        # we use session cookies in SD anyway (in private browsing mode all
        # cookies are set as session cookies), this should not affect session
        # lifetime.
        pref_dict = {
            "network.proxy.no_proxies_on": "127.0.0.1",
            "browser.privatebrowsing.autostart": False,
        }
        if self.accept_languages is not None:
            pref_dict["intl.accept_languages"] = self.accept_languages

        for i in range(self.driver_retry_count):
            try:
                self.torbrowser_driver = TorBrowserDriver(
                    TBB_PATH,
                    tor_cfg=cm.USE_RUNNING_TOR,
                    pref_dict=pref_dict,
                    tbb_logfile_path=LOGFILE_PATH,
                )
                logging.info("Created Tor Browser web driver")
                self.torbrowser_driver.set_window_position(0, 0)
                self.torbrowser_driver.set_window_size(1024, 1200)
                break
            except Exception as e:
                logging.error("Error creating Tor Browser web driver: %s", e)
                # Sleep only when another attempt is coming (the old
                # `i < driver_retry_count` test was always true).
                if i < self.driver_retry_count - 1:
                    time.sleep(self.driver_retry_interval)

        if not self.torbrowser_driver:
            raise Exception("Could not create Tor Browser web driver")

    def create_firefox_driver(self):
        logging.info("Creating Firefox web driver")

        profile = webdriver.FirefoxProfile()
        if self.accept_languages is not None:
            profile.set_preference("intl.accept_languages", self.accept_languages)
            profile.update_preferences()

        for i in range(self.driver_retry_count):
            try:
                self.firefox_driver = webdriver.Firefox(
                    firefox_binary=FIREFOX_PATH,
                    firefox_profile=profile)
                self.firefox_driver.set_window_position(0, 0)
                self.firefox_driver.set_window_size(1024, 1200)
                logging.info("Created Firefox web driver")
                break
            except Exception as e:
                logging.error("Error creating Firefox web driver: %s", e)
                if i < self.driver_retry_count - 1:
                    time.sleep(self.driver_retry_interval)

        if not self.firefox_driver:
            raise Exception("Could not create Firefox web driver")

    def switch_to_firefox_driver(self):
        # Drivers are created lazily on first switch.
        if not self.firefox_driver:
            self.create_firefox_driver()
        self.driver = self.firefox_driver
        logging.info("Switched %s to Firefox driver: %s", self, self.driver)

    def switch_to_torbrowser_driver(self):
        if self.torbrowser_driver is None:
            self.create_torbrowser_driver()
        self.driver = self.torbrowser_driver
        logging.info("Switched %s to TorBrowser driver: %s", self, self.driver)

    def disable_js_torbrowser_driver(self):
        if hasattr(self, 'torbrowser_driver'):
            disable_js(self.torbrowser_driver)

    def start_source_server(self, source_port):
        # session_expiration is expressed in seconds; the app wants minutes.
        config.SESSION_EXPIRATION_MINUTES = self.session_expiration / 60.0

        self.source_app.run(port=source_port, debug=True, use_reloader=False,
                            threaded=True)

    @pytest.fixture(autouse=True)
    def set_default_driver(self):
        logging.info("Creating default web driver: %s", self.default_driver_name)
        if self.default_driver_name == FIREFOX:
            self.switch_to_firefox_driver()
        else:
            self.switch_to_torbrowser_driver()

        yield

        try:
            if self.torbrowser_driver:
                self.torbrowser_driver.quit()
        except Exception as e:
            logging.error("Error stopping TorBrowser driver: %s", e)

        try:
            if self.firefox_driver:
                self.firefox_driver.quit()
        except Exception as e:
            logging.error("Error stopping Firefox driver: %s", e)

    @pytest.fixture(autouse=True)
    def sd_servers(self):
        logging.info("Starting SecureDrop servers (session expiration = %s)",
                     self.session_expiration)

        # Patch the two-factor verification to avoid intermittent errors
        logging.info("Mocking models.Journalist.verify_token")
        with mock.patch("models.Journalist.verify_token", return_value=True):
            logging.info("Mocking source_app.main.get_entropy_estimate")
            with mock.patch("source_app.main.get_entropy_estimate",
                            return_value=8192):
                try:
                    signal.signal(signal.SIGUSR1,
                                  lambda _, s: traceback.print_stack(s))

                    source_port = self._unused_port()
                    journalist_port = self._unused_port()

                    self.source_location = "http://127.0.0.1:%d" % source_port
                    self.journalist_location = "http://127.0.0.1:%d" % journalist_port

                    self.source_app = source_app.create_app(config)
                    self.journalist_app = journalist_app.create_app(config)
                    self.journalist_app.config["WTF_CSRF_ENABLED"] = True

                    self.__context = self.journalist_app.app_context()
                    self.__context.push()

                    env.create_directories()
                    db.create_all()
                    self.gpg = env.init_gpg()

                    # Add our test user
                    try:
                        valid_password = "******"
                        user = Journalist(username="******",
                                          password=valid_password,
                                          is_admin=True)
                        user.otp_secret = "JHCOGO7VCER3EJ4L"
                        db.session.add(user)
                        db.session.commit()
                    except IntegrityError:
                        logging.error("Test user already added")
                        db.session.rollback()

                    # This user is required for our tests cases to login
                    self.admin_user = {
                        "name": "journalist",
                        "password": ("correct horse battery staple"
                                     " profanity oil chewy"),
                        "secret": "JHCOGO7VCER3EJ4L",
                    }

                    self.admin_user["totp"] = pyotp.TOTP(
                        self.admin_user["secret"])

                    def start_journalist_server(app):
                        app.run(port=journalist_port, debug=True,
                                use_reloader=False, threaded=True)

                    self.source_process = Process(
                        target=lambda: self.start_source_server(source_port))

                    self.journalist_process = Process(
                        target=lambda: start_journalist_server(self.journalist_app))

                    self.source_process.start()
                    self.journalist_process.start()

                    # Poll until both apps respond (or ~7.5s have passed).
                    for tick in range(30):
                        try:
                            requests.get(self.source_location, timeout=1)
                            requests.get(self.journalist_location, timeout=1)
                        except Exception:
                            time.sleep(0.25)
                        else:
                            break
                    yield
                finally:
                    try:
                        self.source_process.terminate()
                    except Exception as e:
                        logging.error("Error stopping source app: %s", e)
                    try:
                        self.journalist_process.terminate()
                    except Exception as e:
                        logging.error("Error stopping source app: %s", e)
                    env.teardown()
                    self.__context.pop()

    def wait_for_source_key(self, source_name):
        filesystem_id = self.source_app.crypto_util.hash_codename(source_name)

        def key_available(filesystem_id):
            assert self.source_app.crypto_util.get_fingerprint(filesystem_id)

        self.wait_for(lambda: key_available(filesystem_id), timeout=60)

    def create_new_totp(self, secret):
        self.new_totp = pyotp.TOTP(secret)

    def wait_for(self, function_with_assertion, timeout=None):
        """Polling wait for an arbitrary assertion."""
        # Thanks to
        # http://chimera.labs.oreilly.com/books/1234000000754/ch20.html#_a_common_selenium_problem_race_conditions
        if timeout is None:
            timeout = self.timeout

        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                return function_with_assertion()
            except (AssertionError, WebDriverException):
                time.sleep(self.poll_frequency)
        # one more try, which will raise any errors if they are outstanding
        return function_with_assertion()

    def safe_click_by_id(self, element_id):
        """
        Clicks the element with the given ID attribute.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
            cannot be found in time.
        """
        el = WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            expected_conditions.element_to_be_clickable(
                (By.ID, element_id)))
        # Property access forces the element to scroll into view.
        el.location_once_scrolled_into_view
        el.click()
        return el

    def safe_click_by_css_selector(self, selector):
        """
        Clicks the first element with the given CSS selector.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
            cannot be found in time.
        """
        el = WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            expected_conditions.element_to_be_clickable(
                (By.CSS_SELECTOR, selector)))
        el.click()
        return el

    def safe_click_all_by_css_selector(self, selector, root=None):
        """
        Clicks each element that matches the given CSS selector.

        Returns:
            els (list): The list of elements that matched the selector.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
            cannot be found in time.
        """
        if root is None:
            root = self.driver
        els = self.wait_for(
            lambda: root.find_elements_by_css_selector(selector))
        for el in els:
            clickable_el = WebDriverWait(
                self.driver, self.timeout, self.poll_frequency).until(
                    expected_conditions.element_to_be_clickable(
                        (By.CSS_SELECTOR, selector)))
            clickable_el.click()
        return els

    def safe_send_keys_by_id(self, element_id, text):
        """
        Sends the given text to the element with the specified ID.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
            cannot be found in time.
        """
        el = WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            expected_conditions.element_to_be_clickable(
                (By.ID, element_id)))
        el.send_keys(text)
        return el

    def safe_send_keys_by_css_selector(self, selector, text):
        """
        Sends the given text to the first element with the given CSS selector.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element
            cannot be found in time.
        """
        el = WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            expected_conditions.element_to_be_clickable(
                (By.CSS_SELECTOR, selector)))
        el.send_keys(text)
        return el

    def alert_wait(self, timeout=None):
        if timeout is None:
            timeout = self.timeout * 10
        WebDriverWait(self.driver, timeout, self.poll_frequency).until(
            expected_conditions.alert_is_present(),
            "Timed out waiting for confirmation popup.")

    def alert_accept(self):
        # adapted from https://stackoverflow.com/a/34795883/837471
        def alert_is_not_present(_driver):
            # Renamed from `object`, which shadowed the builtin.
            """ Expect an alert to not be present."""
            try:
                alert = self.driver.switch_to.alert
                alert.text
                return False
            except NoAlertPresentException:
                return True

        self.driver.switch_to.alert.accept()
        WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            alert_is_not_present,
            "Timed out waiting for confirmation popup to disappear.")
class Browser:
    """Instantiate a Selenium driver for a browser/PET combination,
    optionally behind a SOCKS5 proxy and (in VM mode) a virtual display."""

    def __init__(self, config, browser, pet, env_type, proxy_setting):
        """
        If given valid proxy settings, this function will configure socks5
        proxy properly on chrome (brave) and firefox.
        """
        def setup_socks5_proxy(browser, profile, proxy_setting):
            # No-op when no proxy settings were supplied.
            if proxy_setting is not None:
                address = proxy_setting["address"]
                port = proxy_setting["port"]
                bypass_list = proxy_setting["bypass-list"]
                if browser == "chrome":
                    # https://sordidfellow.wordpress.com/2015/05/21/ssh-tunnel-for-chrome/
                    profile.add_argument("--proxy-server=socks5://%s:%s" % (address, port))
                    profile.add_argument("--proxy-bypass-list=%s" % bypass_list)
                    print("socks5 proxy configured on chrome")
                elif browser == "firefox":
                    # https://developer.mozilla.org/en-US/docs/Mozilla/Preferences/Mozilla_networking_preferences
                    profile.set_preference("network.proxy.type", 1)
                    profile.set_preference("network.proxy.socks", address)
                    profile.set_preference("network.proxy.socks_port", port)
                    profile.set_preference("network.proxy.socks_version", 5)
                    profile.set_preference("network.proxy.socks_remote_dns", "true")
                    profile.set_preference("network.proxy.no_proxies_on", bypass_list)
                    print("socks5 proxy configured on firefox")

        # If the program is run in a virtual machine, xvfbwrapper has to get
        # installed first.
        self.env_type = env_type
        if (env_type == "vm"):
            print("xvfb")
            from xvfbwrapper import Xvfb
            width, height, depth = get_display_parameters(config)
            self.vdisplay = Xvfb(width=width, height=height, colordepth=depth)
            self.vdisplay.start()
        print("Browser:", browser, "PET:", pet)
        pet_config = PetConfig()
        if pet == "brave":
            print("brave")
            chrome_options = ChromeOptions()
            bPath, dPath = pet_config.getPetBrowserDriverPath(pet, browser, env_type)
            print(bPath, dPath)
            chromedriver = dPath
            chrome_options.binary_location = bPath
            setup_socks5_proxy("chrome", chrome_options, proxy_setting)
            os.environ["webdriver.chrome.driver"] = chromedriver
            if env_type == "vm":
                chrome_options.add_argument("--no-sandbox")
            self.driver = webdriver.Chrome(executable_path=chromedriver,
                                           chrome_options=chrome_options)
            press_enter(1)
            return
        elif pet == "tor":
            plt = platform.system().lower()
            if plt == "darwin" or plt == "windows":
                # https://stackoverflow.com/questions/15316304/open-tor-browser-with-selenium
                print("native tor")
                bPath, dPath = pet_config.getPetBrowserDriverPath(pet, browser, env_type)
                print(bPath, dPath)
                profile = FirefoxProfile()
                profile.set_preference("network.proxy.type", 0)
                binary = FirefoxBinary(bPath)
                self.driver = webdriver.Firefox(firefox_profile=profile,
                                                firefox_binary=binary,
                                                executable_path=dPath)
            elif plt == "linux":
                # https://medium.com/@manivannan_data/selenium-with-tor-browser-using-python-7b3606b8c55c
                print("vm tor")
                from tbselenium.tbdriver import TorBrowserDriver
                pref_dict = {"network.proxy.no_proxies_on":
                             "http://10.0.2.2/, http://192.168.4.204/"}
                self.driver = TorBrowserDriver(os.environ['TBB_PATH'],
                                               pref_dict=pref_dict)
            return
        aPath, bPath, dPath, pref = pet_config.getPetBrowserDriverPath(pet, browser, env_type)
        if (browser == "firefox"):
            fp = FirefoxProfile()
            setup_socks5_proxy("firefox", fp, proxy_setting)
            binary = FirefoxBinary(bPath)
            if pref is not None:
                fp.set_preference(pref[0], pref[1])
            self.driver = webdriver.Firefox(firefox_profile=fp,
                                            firefox_binary=binary,
                                            executable_path=dPath)
            if (aPath):
                self.driver.install_addon(aPath)
        elif (browser == "chrome"):
            # https://github.com/SeleniumHQ/selenium/issues/5966
            # (The original created a plain ChromeOptions() and immediately
            # overwrote it; a single construction suffices.)
            chrome_options = webdriver.ChromeOptions()
            setup_socks5_proxy("chrome", chrome_options, proxy_setting)
            if aPath:
                chrome_options.add_extension(aPath)
            if pref is not None:
                chrome_options.add_experimental_option(pref[0], pref[1])
            chrome_options.binary_location = bPath
            os.environ["webdriver.chrome.driver"] = dPath
            time.sleep(1)
            self.driver = webdriver.Chrome(executable_path=dPath,
                                           chrome_options=chrome_options)
            # to escape the alert chrome display on first visit
            time.sleep(1)
            press_enter(1)
        elif (browser == "safari"):
            self.driver = webdriver.Safari()
        else:
            print("Unsupported Browser")
            sys.exit(0)

    def quit(self):
        """Shut down the driver and, in VM mode, the virtual display."""
        try:
            self.driver.quit()
        except Exception:
            self.driver.close()  # for Tor
        if (self.env_type == "vm"):
            self.vdisplay.stop()

    def visit_sites(self, site_list, delay=5):
        """Visits all pages in site_list with delay"""
        for site in site_list:
            sys.stdout.write(".")
            sys.stdout.flush()
            try:
                self.driver.get(site)
                time.sleep(delay)
            except Exception:
                print("Unexpected error:", sys.exc_info()[0])
class UntitledTestCase(unittest.TestCase): def setUp(self): print 'Loading...' self.display = Display(visible=0, size=(800, 600)) self.display.start() self.driver = TorBrowserDriver( '/scratch/zilton/troll/tor-browser_pt-BR/', tbb_logfile_path='test.log') # self.driver = webdriver.Chrome('chromium-browser') self.base_url = "https://lemonade.ctweb.inweb.org.br/#/workflows/1/" self.verificationErrors = [] self.accept_next_alert = True def is_visible(self, locator, timeout=20): try: ui.WebDriverWait(self.driver, timeout).until( ec.visibility_of_element_located((By.ID, locator))) return True except TimeoutException: return False def is_not_visible(self, locator, timeout=2): try: ui.WebDriverWait(self.driver, timeout).until_not( ec.visibility_of_element_located((By.ID, locator))) return True except TimeoutException: return False def test_untitled_test_case(self): global workflow_message_error_warning, workflow_message_completed driver = self.driver '''Login''' driver.get("https://lemonade.ctweb.inweb.org.br/#/login") driver.find_element_by_xpath("//input[@type='email']").clear() driver.find_element_by_xpath("//input[@type='email']").send_keys( lemonade_login) driver.find_element_by_xpath("//input[@type='password']").clear() driver.find_element_by_xpath("//input[@type='password']").send_keys( lemonade_password) driver.find_element_by_xpath("//button[@type='submit']").click() time.sleep(LOAD_TIME) count_progress = 1.0 length = len(workflow_ids) index = 0 count_problem = 1 while index < length: workflow_id = workflow_ids[index] '''Access the page of the workflow''' url = self.base_url + str(workflow_id) driver.get(url) '''Execute the workflow''' while True: try: time.sleep(LOAD_TIME * 0.2) driver.find_element_by_id("tlb-execute-wf").click() break except Exception: pass while True: try: time.sleep(LOAD_TIME * 0.2) driver.find_element_by_id("mdl-execute-wf").click() break except Exception: pass '''Monitoring the status of the execution''' time.sleep(LOAD_TIME) status = 
WAITING_MSG current_url = driver.current_url # Workflow with problem if current_url == "https://lemonade.ctweb.inweb.org.br/#/" and count_problem < MAX_LOAD_PROBLEM: count_problem += 1 continue elif count_problem == MAX_LOAD_PROBLEM: status = WARNING_MSG while (status is WAITING_MSG) or (status == RUNNING_MSG): while True: try: status = str( driver.find_element_by_id("dtl-job-status"). get_attribute(name='title').upper()) if status: break time.sleep(LOAD_TIME * 0.2) except Exception: pass if (status == WAITING_MSG) or (status == RUNNING_MSG): driver.refresh() time.sleep(LOAD_TIME) '''Main message after the execution ends''' message = '' if status != WARNING_MSG: while message == '': try: message = driver.find_element_by_id( "dtl-job-status-text").text break except Exception: pass driver.refresh() time.sleep(LOAD_TIME) workflow_name = '' while True and count_problem < MAX_LOAD_PROBLEM: try: time.sleep(LOAD_TIME * 0.2) workflow_name = driver.find_element_by_xpath( "//a[contains(@href, '#/workflows/1/%s')]" % workflow_id).text break except Exception: pass if status == WARNING_MSG: message += ' - The execution presented an atypical problem. ' \ 'Please check the workflow and the correct ' \ 'update of the messages on the Lemonade page.' 
msg_dict = { 'workflow_name': workflow_name, 'workflow_id': workflow_id, 'message': message, 'status': status, 'url': url } if status != COMPLETED_MSG: workflow_message_error_warning.append(msg_dict) else: workflow_message_completed += " " + workflow_id UntitledTestCase.update_progress( job_title='Testing Lemonade workflow: ', progress=count_progress) count_progress += 1 index += 1 count_problem = 1 self.driver.close() @staticmethod def update_progress(job_title, progress): global workflow_ids length = len(workflow_ids) progress = progress / length block = int(round(length * progress)) message = "\r{0}: [{1}] {2}%".format( job_title, ', '.join(workflow_ids[:int(progress * length)]) + "-" * (length - block), round(progress * 100, 2)) if progress >= 1: message += " DONE\r\n" sys.stdout.write(message) sys.stdout.flush() def is_element_present(self, how, what): try: self.driver.find_element(by=how, value=what) except NoSuchElementException: return False return True def is_alert_present(self): try: self.driver.switch_to_alert() except NoAlertPresentException: return False return True def close_alert_and_get_its_text(self): try: alert = self.driver.switch_to_alert() alert_text = alert.text if self.accept_next_alert: alert.accept() else: alert.dismiss() return alert_text finally: self.accept_next_alert = True def tearDown(self): UntitledTestCase.sendEmail() self.driver.quit() self.display.stop() self.assertEqual([], self.verificationErrors) @staticmethod def sendEmail(): global workflow_message_error_warning, workflow_message_completed if len(workflow_message_error_warning) > 0: workflow_message_completed = re.sub("^\s+|\s+$", "", workflow_message_completed) message = 'WORKFLOWS THAT PERFORMED CORRECTLY: %s' % ( workflow_message_completed.replace(' ', ', ')) message += '\n\nWORKFLOWS THAT DID NOT RUN SUCCESSFULLY:\n' for m in workflow_message_error_warning: if m['status'] == WARNING_MSG: message += '\n- WORKFLOW: %s' % m['workflow_id'] else: message += '\n- WORKFLOW: %s' 
% m['workflow_name'] message += '\n\tSTATUS: %s' % m['status'] message += '\n\tMESSAGE: %s' % m['message'] message += '\n\tURL: %s' % m['url'] message += '\n___________________________\n' subject = "[LEMONADE] - Automatic Test for Workflows" email_sender.main(message_status_report=message.encode('utf-8'), subject=subject)
class CrawlerTest(unittest.TestCase):
    """Integration tests for the Tor Browser crawler.

    Each test starts from a clean scratch directory, configures a crawler
    (Tor controller + TorBrowserDriver + virtual display) from a section of
    the test config file, runs a small crawl job, then inspects the capture
    directories the crawl leaves behind.
    """

    def setUp(self):
        # Start every test from a clean scratch directory.
        if isdir(TEST_DIRS):
            shutil.rmtree(TEST_DIRS)
        os.mkdir(TEST_DIRS)
        cm.CONFIG_FILE = os.path.join(cm.TEST_FILES_DIR, 'config.ini')
        self.config = ConfigParser.RawConfigParser()
        self.config.read(cm.CONFIG_FILE)

    def configure_crawler(self, crawl_type, config_section):
        """Build controller, browser driver, crawler and display.

        crawl_type: suffix selecting the crawler_mod.Crawler<crawl_type>
            class (e.g. 'Base', 'WebFP', 'Middle').
        config_section: name of the config.ini section to read torrc,
            Firefox-pref and job settings from.
        """
        device = netifaces.gateways()['default'][netifaces.AF_INET][1]
        tbb_dir = os.path.abspath(cm.TBB_DIR)

        # Configure controller (runs its own tor with the section's torrc).
        torrc_config = ut.get_dict_subconfig(self.config,
                                             config_section, "torrc")
        self.controller = TorController(tbb_dir,
                                        torrc_dict=torrc_config,
                                        pollute=False)

        # Configure browser to talk to the controller's SOCKS port.
        ffprefs = ut.get_dict_subconfig(self.config,
                                        config_section, "ffpref")
        tbb_logfile_path = os.path.join(cm.LOGS_DIR, cm.FF_LOG_FILENAME)
        socks_port = int(torrc_config['socksport'])
        self.driver = TorBrowserDriver(tbb_dir,
                                       tbb_logfile_path=tbb_logfile_path,
                                       tor_cfg=USE_RUNNING_TOR,
                                       pref_dict=ffprefs,
                                       socks_port=socks_port,
                                       canvas_allowed_hosts=[])

        # Instantiate crawler class selected by the crawl_type suffix.
        crawl_type = getattr(crawler_mod, "Crawler" + crawl_type)
        screenshots = True
        self.crawler = crawl_type(self.driver, self.controller,
                                  device=device,
                                  screenshots=screenshots)

        # Configure job
        self.job_config = ut.get_dict_subconfig(self.config,
                                                config_section, "job")

        # Run display ('' means let setup_virtual_display pick defaults).
        virtual_display = ''
        self.xvfb_display = setup_virtual_display(virtual_display)

    def _assert_captcha_marking(self):
        """Assert every capture dir except check.torproject.org's is marked
        with the 'captcha_' prefix (and torproject's is not)."""
        for _dir in os.listdir(cm.CRAWL_DIR):
            marked_captcha = _dir.startswith('captcha_')
            is_torproject_dir = 'check.torproject.org' in _dir
            if is_torproject_dir:
                self.assertTrue(not marked_captcha)
            else:
                self.assertTrue(marked_captcha)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_crawl(self):
        self.configure_crawler('Base', 'captcha_test')
        job = crawler_mod.CrawlJob(self.job_config, TEST_URL_LIST)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS, 'test_crawl')
        self.run_crawl(job)
        # TODO: test for more conditions...
        self.assertGreater(len(os.listdir(cm.CRAWL_DIR)), 0)
        shutil.rmtree(cm.CRAWL_DIR)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_cloudflare_captcha_page(self):
        expected_pcaps = 2
        self.configure_crawler('WebFP', 'captcha_test')
        url = 'https://cloudflare.com/'
        job = crawler_mod.CrawlJob(self.job_config, [url])
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_cloudflare_captcha_results')
        # Reuse run_crawl: it performs the same build_crawl_dirs/chdir/
        # crawl/quit sequence this test previously duplicated inline.
        self.run_crawl(job)
        capture_dirs = glob(os.path.join(cm.CRAWL_DIR, 'captcha_*'))
        self.assertEqual(expected_pcaps, len(capture_dirs))
        shutil.rmtree(cm.CRAWL_DIR)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_not_captcha_after_captcha(self):
        self.configure_crawler('WebFP', 'captcha_test')
        known_captcha_url = 'https://cloudflare.com'
        known_not_captcha_url = 'https://check.torproject.org/'
        urls = [known_captcha_url, known_not_captcha_url]
        job = crawler_mod.CrawlJob(self.job_config, urls)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_not_captcha_after_captcha')
        self.run_crawl(job)
        self._assert_captcha_marking()
        shutil.rmtree(cm.CRAWL_DIR)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_captcha_not_captcha_2_batches(self):
        self.configure_crawler('WebFP', 'test_captcha_not_captcha_2_batches')
        known_captcha_url = 'https://cloudflare.com'
        known_not_captcha_url = 'https://check.torproject.org/'
        urls = [known_captcha_url, known_not_captcha_url]
        job = crawler_mod.CrawlJob(self.job_config, urls)
        # Fixed copy-paste: previously reused the directory name of
        # test_not_captcha_after_captcha, which could collide with it.
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_captcha_not_captcha_2_batches')
        self.run_crawl(job)
        self._assert_captcha_marking()
        shutil.rmtree(cm.CRAWL_DIR)

    def test_website_in_capture_dir(self):
        self.configure_crawler('WebFP', 'captcha_test')
        url = 'https://cloudflare.com/'
        job = crawler_mod.CrawlJob(self.job_config, [url])
        # NOTE(review): this test roots CRAWL_DIR under cm.TEST_DIR while the
        # others use TEST_DIRS -- confirm whether that is intentional.
        cm.CRAWL_DIR = os.path.join(cm.TEST_DIR,
                                    'test_website_in_capture_dir')
        self.run_crawl(job)
        for _dir in os.listdir(cm.CRAWL_DIR):
            self.assertTrue('cloudflare.com' in _dir)
        shutil.rmtree(cm.CRAWL_DIR)

    def run_crawl(self, job):
        """Build crawl dirs, chdir into them and run the job; always shut
        down the browser and the Tor controller afterwards."""
        build_crawl_dirs()
        os.chdir(cm.CRAWL_DIR)
        try:
            self.crawler.crawl(job)  # we can pass batch and instance numbers
        finally:
            self.driver.quit()
            self.controller.quit()

    #@pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_middle(self):
        self.configure_crawler('Middle', 'captcha_test')
        job = crawler_mod.CrawlJob(self.job_config, TEST_URL_LIST)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS, 'test_crawl')
        self.run_crawl(job)
        # TODO: test for more conditions...
        self.assertGreater(len(os.listdir(cm.CRAWL_DIR)), 0)
        shutil.rmtree(cm.CRAWL_DIR)