Example #1
from contextlib import contextmanager

from tbselenium.tbdriver import TorBrowserDriver


class TorBrowserWrapper(object):
    """Wraps the TorBrowserDriver to configure it in the constructor
    and run it with the `launch` method.

    We might consider changing the TorBrowserDriver itself to follow
    torcontroller and stem behaviour: __init__ configures, and a method
    launches the driver/controller; that method is the one used to
    implement the context manager.
    """
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
        self.driver = None

    def __getattr__(self, item):
        # __getattr__ is only called for attributes *not* found by normal
        # lookup, so methods defined on this class (such as `launch`)
        # never reach here; the original check for item == "launch" was
        # dead code and has been dropped.
        if self.driver is None:
            return None
        return getattr(self.driver, item)

    @contextmanager
    def launch(self):
        self.driver = TorBrowserDriver(*self.args, **self.kwargs)
        try:
            yield self.driver
        finally:
            # Quit even if the body of the `with` block raises
            self.driver.quit()
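
A minimal usage sketch of the wrapper (the bundle path and URL below are placeholders, not part of the original example):

# Hypothetical usage; "/opt/tbb/tor-browser_en-US" is a placeholder path
tb = TorBrowserWrapper("/opt/tbb/tor-browser_en-US")
with tb.launch() as driver:
    driver.load_url("https://check.torproject.org")
    print(driver.title)
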
Example #3
def test_close_all_streams(self):
    streams_open = False
    new_tb_drv = TorBrowserDriver(cm.TBB_DIR, tbb_logfile_path='test.log')
    new_tb_drv.get('http://www.google.com')
    time.sleep(30)
    self.tor_controller.close_all_streams()
    for stream in self.tor_controller.controller.get_streams():
        print(stream.id, stream.purpose, stream.target_address, "open!")
        streams_open = True
    new_tb_drv.quit()
    self.assertFalse(streams_open, 'Could not close all streams.')
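
Both versions of this test assume a self.tor_controller fixture; a minimal sketch of the missing setUp/tearDown, modeled on the TorController usage shown in Examples #5 and #6:

# Hypothetical fixture; mirrors the TorController API used below
def setUp(self):
    self.tor_controller = TorController(cm.TBB_DIR)
    self.tor_process = self.tor_controller.launch_tor_service()

def tearDown(self):
    self.tor_controller.quit()
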
Example #4
def test_close_all_streams(self):
    streams_open = False
    new_tb_drv = TorBrowserDriver(cm.TBB_PATH)
    new_tb_drv.get('http://www.google.com')
    time.sleep(30)
    self.tor_controller.close_all_streams()
    for stream in self.tor_controller.controller.get_streams():
        print(stream.id, stream.purpose, stream.target_address, "open!")
        streams_open = True
    new_tb_drv.quit()
    self.assertFalse(streams_open, 'Could not close all streams.')
Example #5
class RunDriverWithControllerTest(unittest.TestCase):
    """
    This test shows how to run tor with TorController and browse with TorBrowserDriver.
    """

    @unittest.skip("Only for didactic purposes.")
    def test_run_driver_with_controller(self):
        # run controller on port N
        custom_socks_port = 6666
        self.tor_controller = TorController(cm.TBB_PATH, torrc_dict={'SocksPort': str(custom_socks_port)})
        self.tor_process = self.tor_controller.launch_tor_service()

        # set driver and get a page
        self.tor_driver = TorBrowserDriver(cm.TBB_PATH, socks_port=custom_socks_port)
        self.tor_driver.get("http://google.com")

        # shutdown
        self.tor_driver.quit()
        self.tor_controller.kill_tor_proc()
Example #6
class RunDriverWithControllerTest(unittest.TestCase):
    """
    This test shows how to run tor with TorController and browse with TorBrowserDriver.
    """
    @unittest.skip("Only for didactic purposes.")
    def test_run_driver_with_controller(self):
        # run controller on port N
        custom_socks_port = 6666
        self.tor_controller = TorController(
            cm.TBB_DIR, torrc_dict={'SocksPort': str(custom_socks_port)})
        self.tor_process = self.tor_controller.launch_tor_service()

        # set driver and get a page
        self.tor_driver = TorBrowserDriver(cm.TBB_DIR,
                                           socks_port=custom_socks_port)
        self.tor_driver.get("http://google.com")

        # shutdown
        self.tor_driver.quit()
        self.tor_controller.quit()
Example #7
class Crawler:
    """Crawls your onions, but also manages Tor, drives Tor Browser, and uses
    information from your Tor cell log and stem to collect cell sequences."""
    def __init__(
            self,
            take_ownership=True,  # Tor dies when the Crawler does
            torrc_config=None,  # None instead of a mutable default dict
            tor_log="/var/log/tor/tor.log",
            tor_cell_log="/var/log/tor/tor_cell_seq.log",
            control_port=9051,
            socks_port=9050,
            run_in_xvfb=True,
            tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
            tb_log_path=join(_log_dir, "firefox.log"),
            tb_tor_cfg=USE_RUNNING_TOR,
            page_load_timeout=20,
            wait_on_page=5,
            wait_after_closing_circuits=0,
            restart_on_sketchy_exception=True,
            additional_control_fields=None,
            db_handler=None):

        self.logger = setup_logging(_log_dir, "crawler")
        # Set stem logging level to INFO - "high level library activity"
        stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)

        # Copy so the caller's dict (or a shared default) is never mutated
        self.torrc_config = dict(torrc_config or {"CookieAuth": "1"})
        self.socks_port = find_free_port(socks_port, control_port)
        self.torrc_config.update({"SocksPort": str(self.socks_port)})
        self.control_port = find_free_port(control_port, self.socks_port)
        self.torrc_config.update({"ControlPort": str(self.control_port)})
        self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})
        self.logger.info("Starting tor process with config "
                         "{torrc_config}.".format(**locals()))
        self.tor_process = launch_tor_with_config(
            config=self.torrc_config, take_ownership=take_ownership)
        self.authenticate_to_tor_controlport()

        self.logger.info("Opening cell log stream...")
        self.cell_log = open(tor_cell_log, "rb")

        # Record the flag unconditionally so the attribute exists even when
        # Xvfb is disabled
        self.run_in_xvfb = run_in_xvfb
        if run_in_xvfb:
            self.logger.info("Starting Xvfb...")
            self.virtual_framebuffer = start_xvfb()

        self.logger.info("Starting Tor Browser...")
        self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                          tor_cfg=tb_tor_cfg,
                                          tbb_logfile_path=tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)

        self.wait_after_closing_circuits = wait_after_closing_circuits
        self.page_load_timeout = page_load_timeout
        self.tb_driver.set_page_load_timeout(page_load_timeout)
        self.wait_on_page = wait_on_page
        self.restart_on_sketchy_exception = restart_on_sketchy_exception

        self.control_data = self.get_control_data(page_load_timeout,
                                                  wait_on_page,
                                                  wait_after_closing_circuits,
                                                  additional_control_fields)
        self.db_handler = db_handler
        if db_handler:
            self.crawlid = self.db_handler.add_crawl(self.control_data)

    def authenticate_to_tor_controlport(self):
        self.logger.info("Authenticating to the tor controlport...")
        try:
            self.controller = Controller.from_port(port=self.control_port)
        except stem.SocketError as exc:
            panic("Unable to connect to tor on port {self.control_port}: "
                  "{exc}".format(**locals()))
        try:
            self.controller.authenticate()
        except stem.connection.MissingPassword:
            panic("Unable to authenticate to tor controlport. Please add "
                  "`CookieAuth 1` to your tor configuration file.")

    def get_control_data(self, page_load_timeout, wait_on_page,
                         wait_after_closing_circuits,
                         additional_control_fields):
        """Gather metadata about the crawler instance."""
        control_data = {}
        # Configuration settings
        control_data["page_load_timeout"] = page_load_timeout
        control_data["wait_on_page"] = wait_on_page
        control_data["wait_after_closing_circuits"] = \
                wait_after_closing_circuits
        if additional_control_fields:
            control_data.update(additional_control_fields)
        # System facts
        control_data["kernel"] = platform.system()
        control_data["kernel_version"] = platform.release()
        control_data["os"] = platform.version()
        control_data["python_version"] = platform.python_version()
        ip = urlopen("https://api.ipify.org").read().decode()
        control_data["ip"] = ip
        # This API seems to be unstable and we haven't found a suitable
        # alternative :(
        try:
            asn_geoip = urlopen("http://api.moocher.io/ip/{}".format(ip))
            asn_geoip = literal_eval(asn_geoip.read().decode())
            control_data["asn"] = asn_geoip.get("ip").get("as").get("asn")
            control_data["city"] = asn_geoip.get("ip").get("city")
            control_data["country"] = asn_geoip.get("ip").get("country")
        except urllib.error.HTTPError:
            self.logger.warning("Unable to query ASN API and thus some "
                                "control data may be missing from this run.")
        control_data["tor_version"] = self.controller.get_version().version_str
        control_data["tb_version"] = self.tb_driver.tb_version
        # Tor will have multiple entry nodes in its state file, but will
        # choose the first sequential one that is up as its entry guard.
        entry_nodes = self.controller.get_info("entry-guards").split('\n')
        control_data["entry_node"] = next(
            re.search("[0-9A-F]{40}", g).group(0) for g in entry_nodes
            if re.search("up", g))
        control_data["crawler_version"] = _version
        return control_data

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Return None: returning a truthy value (as the original `return
        # self` did) would silently suppress any in-flight exception.

    def __del__(self):
        self.close()

    def close(self):
        self.logger.info("Beginning Crawler exit process...")
        if hasattr(self, "tb_driver"):
            self.logger.info("Closing Tor Browser...")
            self.tb_driver.quit()
        if hasattr(self, "virtual_framebuffer"):
            self.logger.info("Closing the virtual framebuffer...")
            # A bug in pyvirtualdisplay triggers a KeyError when closing a
            # virtual framebuffer if the $DISPLAY environment variable is
            # not set.
            try:
                stop_xvfb(self.virtual_framebuffer)
            except KeyError:
                pass
        if hasattr(self, "cell_log"):
            self.logger.info("Closing the Tor cell stream...")
            self.cell_log.close()
        if hasattr(self, "tor_process"):
            self.logger.info("Killing the tor process...")
            self.tor_process.kill()
        self.logger.info("Crawler exit completed.")

    def collect_onion_trace(self,
                            url,
                            hsid=None,
                            extra_fn=None,
                            trace_dir=None,
                            iteration=0):
        """Crawl an onion service and collect a complete cell sequence for the
        activity at the time. Also, record additional information about the
        circuits with stem. Optionally, pass a function to execute additional
        actions after the page has loaded."""
        # Todo: create collect_trace method that works for regular sites as
        # well
        assert ".onion" in url, ("This method is only suitable for crawling "
                                 "onion services.")

        self.logger.info("{url}: closing existing circuits before starting "
                         "crawl.".format(**locals()))
        for circuit in self.controller.get_circuits():
            self.controller.close_circuit(circuit.id)

        sleep(self.wait_after_closing_circuits)

        if not trace_dir:
            trace_dir = self.make_ts_dir()
        trace_name = urllib.parse.quote(url, safe="") + "-" + str(iteration)
        trace_path = join(trace_dir, trace_name)

        start_idx = self.get_cell_log_pos()

        try:
            self.crawl_url(url)
            rend_circ_ids = self.get_rend_circ_ids(url)
            if extra_fn:
                self.execute_extra_fn(extra_fn, url, trace_path, start_idx)
        except CrawlerLoggedError:
            return "failed"
        except CrawlerNoRendCircError:
            self.save_debug_log(url, trace_path, start_idx)
            return "failed"
        except Exception:
            self.logger.exception("{url}: unusual exception "
                                  "encountered:".format(**locals()))
            # Also log active circuit info
            self.controller.get_circuits()

            exc_type, exc_value, exc_traceback = exc_info()
            if exc_type in _sketchy_exceptions:
                self.save_debug_log(url, trace_path, start_idx)
                if self.restart_on_sketchy_exception:
                    self.restart_tb()

            return "failed"

        self.logger.info("{url}: saving full trace...".format(**locals()))
        end_idx = self.get_cell_log_pos()
        full_trace = self.get_full_trace(start_idx, end_idx)

        # Save the trace to the database or write to file
        if self.db_handler:
            # hsid is a keyword argument and thus always bound, so the
            # original `except NameError` could never fire; check the value
            # explicitly instead.
            if hsid is None:
                panic("If using the database, and calling collect_onion_trace "
                      "directly, you must specify the hsid of the site.")
            new_example = {
                'hsid': hsid,
                'crawlid': self.crawlid,
                't_scrape': get_timestamp("db")
            }
            exampleid = self.db_handler.add_example(new_example)
            self.db_handler.add_trace(str(full_trace), exampleid)
        else:
            with open(trace_path + "-full", "wb") as fh:
                fh.write(full_trace)

        return "succeeded"

    def make_ts_dir(self, parent_dir=_log_dir, raw_dir_name="batch"):
        """Creates a timestamped folder to hold a group of traces."""
        raw_dirpath = join(parent_dir, raw_dir_name)
        ts = get_timestamp("log")
        ts_dir = timestamp_file(raw_dirpath, ts, is_dir=True)
        symlink_cur_to_latest(raw_dirpath, ts)

        with open(join(ts_dir, "control.pickle"), "wb") as fh:
            pickle.dump(self.control_data, fh)

        return ts_dir

    def get_cell_log_pos(self):
        """Return the current end-of-file byte offset of the Tor cell log."""
        return self.cell_log.seek(0, SEEK_END)

    def crawl_url(self, url):
        """Load a web page in Tor Browser and optionally pass a function
        to execute custom actions on it."""

        self.logger.info("{url}: starting page load...".format(**locals()))

        try:
            self.tb_driver.load_url(url,
                                    wait_on_page=self.wait_on_page,
                                    wait_for_page_body=True)
        except TimeoutException:
            self.logger.warning("{url}: timed out.".format(**locals()))
            raise CrawlerLoggedError
        except http.client.CannotSendRequest:
            self.logger.warning("{url}: cannot send request--improper "
                                "connection state.".format(**locals()))
            raise CrawlerLoggedError

        # Make sure we haven't just hit an error page or nothing loaded
        try:
            if (self.tb_driver.is_connection_error_page
                    or self.tb_driver.current_url == "about:newtab"):
                raise CrawlerReachedErrorPage
        except CrawlerReachedErrorPage:
            self.logger.warning("{url}: reached connection error "
                                "page.".format(**locals()))
            raise CrawlerLoggedError

        self.logger.info("{url}: successfully loaded.".format(**locals()))

    def get_rend_circ_ids(self, url):
        """Returns the rendezvous circuit id(s) associated with a given onion
        service."""
        self.logger.info("{url}: collecting circuit "
                         "information...".format(**locals()))
        active_circs = self.controller.get_circuits()
        rend_circ_ids = set()

        for circ in active_circs:
            if (circ.purpose == "HS_CLIENT_REND" and circ.socks_username
                    and circ.socks_username in url):
                rend_circ_ids.add(circ.id)

        # If everything goes perfect, we should only see one. Multiple indicate
        # the first failed. Zero indicates one closed abruptly (or there's an
        # error with stem--still waiting on data to confirm or deny).
        rend_circ_ct = len(rend_circ_ids)
        self.logger.info(
            "{url}: {rend_circ_ct} associated rendezvous circuits "
            "discovered.".format(**locals()))
        if rend_circ_ct == 0:
            raise CrawlerNoRendCircError

        return rend_circ_ids

    def execute_extra_fn(self, extra_fn, url, trace_path, start_idx):
        # extra_fn is the callable passed to collect_onion_trace; the
        # original body referenced a module-level `extra_fn` that did not
        # exist, raising a NameError.
        self.logger.info("{url}: executing extra function "
                         "code...".format(**locals()))
        extra_fn(self, url, trace_path, start_idx)
        self.logger.info("{url}: extra function executed "
                         "successfully.".format(**locals()))

    def save_debug_log(self, url, trace_path, start_idx):
        self.logger.warning("{url}: saving debug log...".format(**locals()))
        exc_time = self.get_cell_log_pos()
        trace = self.get_full_trace(start_idx, exc_time)
        with open(trace_path + "@debug", "wb") as fh:
            fh.write(trace)

    def get_full_trace(self, start_idx, end_idx):
        """Returns the Tor DATA cells transmitted over a circuit during a
        specified time period."""
        # Sanity check
        assert start_idx >= 0 and end_idx > 0, ("Invalid (negative) logfile "
                                                "position")
        assert end_idx > start_idx, ("logfile section end_idx must come "
                                     "after start_idx")

        self.cell_log.seek(start_idx, SEEK_SET)
        return self.cell_log.read(end_idx - start_idx)

    def restart_tb(self):
        """Restarts the Tor Browser."""
        self.logger.info("Restarting the Tor Browser...")
        self.tb_driver.quit()
        # Use the paths saved in __init__; the originals were local
        # variables there, so referencing them here raised a NameError.
        self.tb_driver = TorBrowserDriver(tbb_path=self.tbb_path,
                                          tor_cfg=USE_RUNNING_TOR,
                                          tbb_logfile_path=self.tb_log_path,
                                          socks_port=self.socks_port,
                                          control_port=self.control_port)
        # Re-apply the page load timeout the old driver was using
        self.tb_driver.set_page_load_timeout(self.page_load_timeout)
        self.logger.info("Tor Browser restarted.")

    def collect_set_of_traces(self,
                              url_set,
                              extra_fn=None,
                              trace_dir=None,
                              iteration=0,
                              shuffle=True,
                              retry=True,
                              url_to_id_mapping=None):
        """Collect a set of traces."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = url_set
            trace_dir = None
        elif not trace_dir:
            trace_dir = self.make_ts_dir()

        set_size = len(url_set)
        self.logger.info("Saving set of {set_size} traces to "
                         "{trace_dir}.".format(**locals()))

        # Converts both sets (from pickle files) and dicts (whose keys are
        # URLs--from database) to URL lists
        url_set = list(url_set)
        if shuffle:
            random.shuffle(url_set)

        failed_urls = []

        for url_idx in range(set_size):
            self.logger.info("Collecting trace {} of "
                             "{set_size}...".format(url_idx + 1, **locals()))
            url = url_set[url_idx]
            if self.db_handler:
                hsid = url_to_id_mapping[url]
            else:
                hsid = None

            if (self.collect_onion_trace(url,
                                         hsid=hsid,
                                         extra_fn=extra_fn,
                                         trace_dir=trace_dir,
                                         iteration=iteration) == "failed"
                    and retry):
                failed_urls.append(url)

        if failed_urls:
            failed_ct = len(failed_urls)
            self.logger.info("Retrying {failed_ct} of {set_size} traces that "
                             "failed.".format(**locals()))
            self.collect_set_of_traces(failed_urls,
                                       extra_fn=extra_fn,
                                       trace_dir=trace_dir,
                                       iteration=iteration,
                                       shuffle=shuffle,
                                       retry=False,
                                       url_to_id_mapping=url_to_id_mapping)

    def crawl_monitored_nonmonitored(self,
                                     monitored_class,
                                     nonmonitored_class,
                                     extra_fn=None,
                                     shuffle=True,
                                     retry=True,
                                     monitored_name="monitored",
                                     nonmonitored_name="nonmonitored",
                                     url_to_id_mapping=None,
                                     ratio=40):
        """Crawl a monitored class ratio times interspersed between the
        crawling of a(n ostensibly larger) non-monitored class."""
        if self.db_handler:
            if not url_to_id_mapping:
                url_to_id_mapping = nonmonitored_class
                url_to_id_mapping.update(monitored_class)
            trace_dir, mon_trace_dir, nonmon_trace_dir = (None, ) * 3
        else:
            trace_dir = self.make_ts_dir()
            mon_trace_dir = join(trace_dir, monitored_name)
            mkdir(mon_trace_dir)
            nonmon_trace_dir = join(trace_dir, nonmonitored_name)
            mkdir(nonmon_trace_dir)

        # db: calling list on a dict returns a list of its keys (URLs)
        # pickle: calling list on set is necessary to make it shuffleable
        nonmonitored_class = list(nonmonitored_class)
        monitored_class = list(monitored_class)

        nonmonitored_class_ct = len(nonmonitored_class)
        chunk_size = int(nonmonitored_class_ct / ratio)

        if shuffle:
            random.shuffle(nonmonitored_class)
            random.shuffle(monitored_class)

        for iteration in range(ratio):
            self.logger.info("Beginning iteration {i} of {ratio} in the "
                             "{monitored_name} class".format(i=iteration + 1,
                                                             **locals()))
            self.collect_set_of_traces(monitored_class,
                                       trace_dir=mon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)

            slice_lb = iteration * chunk_size
            slice_ub = min((iteration + 1) * chunk_size, nonmonitored_class_ct)
            self.logger.info("Crawling services {} through {slice_ub} of "
                             "{nonmonitored_class_ct} in the "
                             "{nonmonitored_name} "
                             "class".format(slice_lb + 1, **locals()))
            self.collect_set_of_traces(nonmonitored_class[slice_lb:slice_ub],
                                       trace_dir=nonmon_trace_dir,
                                       iteration=iteration,
                                       url_to_id_mapping=url_to_id_mapping)
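
Since Crawler implements the context-manager protocol, a run can be sketched as follows (the onion addresses below are placeholders, not from the original project):

# Hypothetical usage sketch; the .onion URLs are illustrative only
urls = {"http://exampleonionsiteaddress1.onion",
        "http://exampleonionsiteaddress2.onion"}
with Crawler(page_load_timeout=30) as crawler:
    crawler.collect_set_of_traces(urls)
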
Example #9
#!/usr/bin/env python3
from tbselenium.tbdriver import TorBrowserDriver
import time

from sys import argv, exit

tbpath = "tor-browser_en-US"

if len(argv) == 1:
    website = "about:blank"
else:
    website = argv[1]

driver = TorBrowserDriver(tbpath)
driver.set_page_load_timeout(90)
try:
    driver.load_url(website)
except Exception as e:
    print(e)
    driver.quit()
    exit(1)

time.sleep(1)

driver.quit()
exit(0)
Example #10
def tor_web_crawler(index, link, ip_address):
    """
    This function is a web crawler for collection of traffic traces and saving those traces to pcap files.
    :param index: current trace of the link
    :param link: webpage address from where traffic is to be collected
    :param ip_address: ip-addres of the machine from which traffic is to be collected
    :param timeout: duration upto which traffic information needs to be collected
    :param pkt_count: number of packets to be collected for a particular trace
    :return:
    """

    # Extracting domain name for saving trace separately
    url = link
    lnk = tldextract.extract(url)
    domain_name = lnk.domain + '.' + lnk.suffix
    # print('Processing trace for domain name crawl : ', domain_name)

    # interface = 'enp0s31f6'
    # interface = 'any'
    interface = 'eth0'
    cap = DesiredCapabilities().FIREFOX
    cap["marionette"] = True  # optional

    # Directories for the pcap files and the screenshots (moved out of the
    # try block so they are defined even if the first launch attempt fails)
    PP = PACP_PATH + '/' + domain_name
    SS = SCREEN_SHOT + '/' + domain_name

    try:
        driver = TorBrowserDriver(TBB_PATH)
        driver.get(url)
    except wde as e:  # wde: presumably selenium's WebDriverException
        print('Browser crashed:')
        print(e)
        print('Trying again in 10 seconds ...')
        time.sleep(10)
        # The original `driver = driver` was a no-op; actually retry here
        driver = TorBrowserDriver(TBB_PATH)
        driver.get(url)
        print('Success!\n')

    if not os.path.isdir(PP):
        print('Creating directory for saving capture files (pcap) ...')
        os.makedirs(PP)

    if not os.path.isdir(SS):
        print('Creating directory for saving screenshots ...')
        os.makedirs(SS)

    # command to be executed for capturing the trace
    # command = "sudo tcpdump -i " + str(interface) + " -n host " + str(ip_address) + " -c " + str(pkt_count) + " -w " + PP + "/" + domain_name + "_" + str(index) + ".pcap "
    command = "sudo timeout 60 tcpdump -i " + str(
        interface) + " -n host " + str(
            ip_address) + " -w " + PP + "/" + domain_name + "_" + str(
                index) + ".pcap"
    print('Capture trace ...')
    capture = subprocess.Popen(command, shell=True)
    #     time.sleep(1)
    capture.wait()
    print('Traffic trace captured and saved successfully.')
    # save the screenshot
    driver.save_screenshot(SS + '/' + domain_name + '-' + str(index) + '.png')
    print('Screen shot of the webpage saved successfully.')
    driver.quit()
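
A hypothetical invocation, collecting several traces of one page (the URL and IP address below are placeholders):

# Hypothetical driver loop; 192.0.2.10 is a documentation-range address
for i in range(5):
    tor_web_crawler(index=i, link='https://example.com',
                    ip_address='192.0.2.10')
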
Example #11
class InstagramScraper():
    """InstagramScraper: Web scraper class.

    The browser type is represented by a numeric WebBrowserType value
    instead of a string, which allows for easy checking and
    changing/expanding of the supported values.
    """
    def __init__(self, browser_type, user_data_dir=None):
        # internal flag so we know what sort of web browser we are instantiating
        self.WebBrowserType = browser_type 

        # various browser initiation according to different browser types
        if (browser_type == WebBrowserType.CHROME):
            print_dbg_msg_L1("\t[+] Starting Chrome...")
            options = webdriver.chrome.options.Options()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            options.add_argument("--no-default-browser-check")
            self.browser = webdriver.Chrome(chrome_options=options)
        elif (browser_type == WebBrowserType.CHROME_DEBUG):
            print_dbg_msg_L1("\t[+] Starting Chrome in debug mode...")
            options = webdriver.chrome.options.Options()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            options.add_argument("--no-default-browser-check")
            options.add_argument("--remote-debugging-port=9222")
            if user_data_dir is None:
                user_data_dir = chrome_debug_profile + "/" + str(secrets.token_hex(16))
            #print_dbg_msg_L1("\t\t[+] User data dir: " + user_data_dir)
            if not os.path.exists(user_data_dir):
                os.makedirs(user_data_dir)
            options.add_argument("--user-data-dir=" + user_data_dir)
            self.browser = webdriver.Chrome(chrome_options=options)
        elif (browser_type == WebBrowserType.TOR):
            # Sometimes the Tor process fails to launch or the web browser
            # fails to instantiate properly. Regardless, loop until both the
            # Tor process and the browser are instantiated correctly. So far,
            # over 30,000 runs, instantiation has usually succeeded after at
            # most one failure.
            while True:
                try:
                    self.tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir)
                    self.browser = TorBrowserDriver(tbb_dir,
                                                    tor_cfg=cm.USE_STEM,
                                                    tbb_profile_path=tbb_ff_default_dir,
                                                    tbb_logfile_path=tbb_log_dir)
                except Exception as e:
                    print_dbg_msg_L1("\t[+] " + str(e))
                    print_dbg_msg_L1("\t[+] Error instantiating browser, retrying...")
                    time.sleep(1)
                    continue
                else:
                    break
        else:
            self.browser = webdriver.Firefox()

    def get(self, targetWebAddress):
        self.browser.get(targetWebAddress)

    def close(self):
        self.browser.quit()
        if self.WebBrowserType == WebBrowserType.TOR:
            self.tor_process.kill()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Delegate to close() so the Tor process is terminated too; the
        # original defined __exit__ without __enter__, so the class could
        # not actually be used in a `with` block.
        self.close()
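
A minimal usage sketch, assuming the module-level WebBrowserType constants and relying on the context-manager methods defined above:

# Hypothetical usage; the target URL is a placeholder
with InstagramScraper(WebBrowserType.TOR) as scraper:
    scraper.get("https://www.instagram.com/")
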
Example #12
class Visit(object):
    """Hold info about a particular visit to a page."""

    def __init__(self, batch_num, site_num, instance_num, page_url, base_dir, tor_controller, bg_site=None,
                 experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False, capture_screen=True):
        self.batch_num = batch_num
        self.site_num = site_num
        self.instance_num = instance_num
        self.page_url = page_url
        self.bg_site = bg_site
        self.experiment = experiment
        self.base_dir = base_dir
        self.visit_dir = None
        self.visit_log_dir = None
        self.tbb_version = cm.RECOMMENDED_TBB_VERSION
        self.capture_screen = capture_screen
        self.tor_controller = tor_controller
        self.xvfb = xvfb
        self.init_visit_dir()
        self.pcap_path = os.path.join(
            self.visit_dir, "{}.pcap".format(self.get_instance_name()))

        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Starting XVFBm %sX%s" % (cm.XVFB_W, cm.XVFB_H))
            self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
            self.vdisplay.start()

        # Create new instance of TorBrowser driver
        TorBrowserDriver.add_exception(self.page_url)
        self.tb_driver = TorBrowserDriver(tbb_path=cm.TBB_PATH,
                                          tbb_logfile_path=join(self.visit_dir, "logs", "firefox.log"))
        self.sniffer = Sniffer()  # sniffer to capture the network traffic

    def init_visit_dir(self):
        """Create results and logs directories for this visit."""
        visit_name = str(self.instance_num)
        self.visit_dir = os.path.join(self.base_dir, visit_name)
        ut.create_dir(self.visit_dir)
        self.visit_log_dir = os.path.join(self.visit_dir, 'logs')
        ut.create_dir(self.visit_log_dir)

    def get_instance_name(self):
        """Construct and return a filename for the instance."""
        inst_file_name = '{}_{}_{}' \
            .format(self.batch_num, self.site_num, self.instance_num)
        return inst_file_name

    def filter_guards_from_pcap(self):
        guard_ips = set(self.tor_controller.get_all_guard_ips())
        wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
        orig_pcap = self.pcap_path + ".original"
        copyfile(self.pcap_path, orig_pcap)
        try:
            preader = PcapReader(orig_pcap)
            pcap_filtered = []
            for p in preader:
                if IP not in p:
                    pcap_filtered.append(p)
                    continue
                ip = p.payload
                if ip.dst in guard_ips or ip.src in guard_ips:
                    pcap_filtered.append(p)
            wrpcap(self.pcap_path, pcap_filtered)
        except Exception as e:
            wl_log.error("ERROR: filtering pcap file: %s. Check old pcap: %s",
                         e, orig_pcap)
        else:
            os.remove(orig_pcap)

    def post_crawl(self):
        pass
        # TODO: add some sanity checks?

    def cleanup_visit(self):
        """Kill sniffer and Tor browser if they're running."""
        wl_log.info("Cleaning up visit.")
        wl_log.info("Cancelling timeout")
        ut.cancel_timeout()

        if self.sniffer and self.sniffer.is_recording:
            wl_log.info("Stopping sniffer...")
            self.sniffer.stop_capture()

        # remove non-tor traffic
        self.filter_guards_from_pcap()

        if self.tb_driver and self.tb_driver.is_running:
            # shutil.rmtree(self.tb_driver.prof_dir_path)
            wl_log.info("Quitting selenium driver...")
            self.tb_driver.quit()

        # close all open streams to prevent pollution
        self.tor_controller.close_all_streams()
        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Stopping display...")
            self.vdisplay.stop()

        # after closing the driver and stopping the sniffer, we run post_crawl
        self.post_crawl()

    def take_screenshot(self):
        try:
            out_png = os.path.join(self.visit_dir, 'screenshot.png')
            wl_log.info("Taking screenshot of %s to %s" % (self.page_url,
                                                           out_png))
            self.tb_driver.get_screenshot_as_file(out_png)
            if cm.running_in_CI:
                wl_log.debug("Screenshot data:image/png;base64,%s"
                             % self.tb_driver.get_screenshot_as_base64())
        except Exception:
            wl_log.info("Exception while taking screenshot of: %s"
                        % self.page_url)

    def get_wang_and_goldberg(self):
        """Visit the site according to Wang and Goldberg (WPES'13) settings."""
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to stop the visit
        self.sniffer.start_capture(self.pcap_path,
                                   'tcp and not host %s and not tcp port 22 and not tcp port 20'
                                   % LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except Exception:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {}".format(self.page_url))

        t1 = time.time()
        self.tb_driver.get(self.page_url)
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get_multitab(self):
        """Open two tab, use one to load a background site and the other to
        load the real site."""
        PAUSE_BETWEEN_TAB_OPENINGS = 0.5
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to kill running procs
        # load a blank page - a page is needed to send keys to the browser
        self.tb_driver.get(BAREBONE_HOME_PAGE)
        self.sniffer.start_capture(self.pcap_path,
                                   'tcp and not host %s and not tcp port 22 and not tcp port 20'
                                   % LOCALHOST_IP)

        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except Exception:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {} with {} in the background".
                    format(self.page_url, self.bg_site))

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        # now that the focus is on the address bar, load the background
        # site by "typing" it to the address bar and "pressing" ENTER (\n)
        # simulated by send_keys function
        body.send_keys('%s\n' % self.bg_site)

        # the delay between the loading of background and real sites
        time.sleep(PAUSE_BETWEEN_TAB_OPENINGS)

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab

        t1 = time.time()
        self.tb_driver.get(self.page_url)  # load the real site in the 2nd tab

        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()

    def get(self):
        """Call the specific visit function depending on the experiment."""
        if self.experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG:
            self.get_wang_and_goldberg()
        elif self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
            self.get_multitab()
        else:
            raise ValueError("Cannot determine experiment type")
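
A minimal sketch of how the dispatcher above might be driven, assuming (hypothetically) that the enclosing visit class is named Visit and takes the URL and experiment type as constructor arguments; neither the class name nor the constructor signature is taken from the code above:

# Hypothetical driver loop: the `Visit` name and its constructor
# signature are assumptions made for illustration only.
for url in url_list:
    visit = Visit(page_url=url, experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG)
    try:
        visit.get()  # dispatches to get_wang_and_goldberg()
    except ValueError:
        wl_log.error("Unknown experiment type, skipping %s", url)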
Example #13
0
class FunctionalTest(object):
    gpg = None
    new_totp = None
    session_expiration = 30
    secret_message = "These documents outline a major government invasion of privacy."
    timeout = 10
    poll_frequency = 0.1

    accept_languages = None
    default_driver_name = TORBROWSER
    driver = None
    firefox_driver = None
    torbrowser_driver = None

    driver_retry_count = 3
    driver_retry_interval = 5

    def _unused_port(self):
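        # bind to port 0 so the OS assigns a free ephemeral port, then release it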
        s = socket.socket()
        s.bind(("127.0.0.1", 0))
        port = s.getsockname()[1]
        s.close()
        return port

    def set_tbb_securitylevel(self, level):
        if level not in {SECURITY_HIGH, SECURITY_MEDIUM, SECURITY_LOW}:
            raise ValueError("Invalid Tor Browser security setting: " +
                             str(level))
        # torbrowser_driver is a class attribute defaulting to None, so
        # hasattr() would always be true; check for an actual driver instead
        if self.torbrowser_driver:
            set_security_level(self.torbrowser_driver, level)

    def create_torbrowser_driver(self):
        logging.info("Creating TorBrowserDriver")
        log_file = open(LOGFILE_PATH, "a")
        log_file.write("\n\n[%s] Running Functional Tests\n" %
                       str(datetime.now()))
        log_file.flush()

        # Don't use Tor when reading from localhost, and turn off private
        # browsing. We need to turn off private browsing because we won't be
        # able to access the browser's cookies in private browsing mode. Since
        # we use session cookies in SD anyway (in private browsing mode all
        # cookies are set as session cookies), this should not affect session
        # lifetime.
        pref_dict = {
            "network.proxy.no_proxies_on": "127.0.0.1",
            "browser.privatebrowsing.autostart": False,
        }
        if self.accept_languages is not None:
            pref_dict["intl.accept_languages"] = self.accept_languages

        for i in range(self.driver_retry_count):
            try:
                self.torbrowser_driver = TorBrowserDriver(
                    TBB_PATH,
                    tor_cfg=cm.USE_RUNNING_TOR,
                    pref_dict=pref_dict,
                    tbb_logfile_path=LOGFILE_PATH,
                )
                logging.info("Created Tor Browser web driver")
                self.torbrowser_driver.set_window_position(0, 0)
                self.torbrowser_driver.set_window_size(1024, 1200)
                break
            except Exception as e:
                logging.error("Error creating Tor Browser web driver: %s", e)
                if i < self.driver_retry_count - 1:
                    time.sleep(self.driver_retry_interval)

        if not self.torbrowser_driver:
            raise Exception("Could not create Tor Browser web driver")

    def create_firefox_driver(self):
        logging.info("Creating Firefox web driver")

        profile = webdriver.FirefoxProfile()
        if self.accept_languages is not None:
            profile.set_preference("intl.accept_languages",
                                   self.accept_languages)
            profile.update_preferences()

        for i in range(self.driver_retry_count):
            try:
                self.firefox_driver = webdriver.Firefox(
                    firefox_binary=FIREFOX_PATH, firefox_profile=profile)
                self.firefox_driver.set_window_position(0, 0)
                self.firefox_driver.set_window_size(1024, 1200)
                logging.info("Created Firefox web driver")
                break
            except Exception as e:
                logging.error("Error creating Firefox web driver: %s", e)
                if i < self.driver_retry_count - 1:
                    time.sleep(self.driver_retry_interval)
        if not self.firefox_driver:
            raise Exception("Could not create Firefox web driver")

    def switch_to_firefox_driver(self):
        if not self.firefox_driver:
            self.create_firefox_driver()
        self.driver = self.firefox_driver
        logging.info("Switched %s to Firefox driver: %s", self, self.driver)

    def switch_to_torbrowser_driver(self):
        if self.torbrowser_driver is None:
            self.create_torbrowser_driver()
        self.driver = self.torbrowser_driver
        logging.info("Switched %s to TorBrowser driver: %s", self, self.driver)

    def disable_js_torbrowser_driver(self):
        if self.torbrowser_driver:
            disable_js(self.torbrowser_driver)

    def start_source_server(self, source_port):
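        # session_expiration is given in seconds; the app config expects minutes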
        config.SESSION_EXPIRATION_MINUTES = self.session_expiration / 60.0

        self.source_app.run(port=source_port,
                            debug=True,
                            use_reloader=False,
                            threaded=True)

    @pytest.fixture(autouse=True)
    def set_default_driver(self):
        logging.info("Creating default web driver: %s",
                     self.default_driver_name)
        if self.default_driver_name == FIREFOX:
            self.switch_to_firefox_driver()
        else:
            self.switch_to_torbrowser_driver()

        yield

        try:
            if self.torbrowser_driver:
                self.torbrowser_driver.quit()
        except Exception as e:
            logging.error("Error stopping TorBrowser driver: %s", e)

        try:
            if self.firefox_driver:
                self.firefox_driver.quit()
        except Exception as e:
            logging.error("Error stopping Firefox driver: %s", e)

    @pytest.fixture(autouse=True)
    def sd_servers(self):
        logging.info("Starting SecureDrop servers (session expiration = %s)",
                     self.session_expiration)

        # Patch the two-factor verification to avoid intermittent errors
        logging.info("Mocking models.Journalist.verify_token")
        with mock.patch("models.Journalist.verify_token", return_value=True):
            logging.info("Mocking source_app.main.get_entropy_estimate")
            with mock.patch("source_app.main.get_entropy_estimate",
                            return_value=8192):

                try:
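                    # SIGUSR1 dumps the current stack, useful when a test hangs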
                    signal.signal(signal.SIGUSR1,
                                  lambda _, s: traceback.print_stack(s))

                    source_port = self._unused_port()
                    journalist_port = self._unused_port()

                    self.source_location = "http://127.0.0.1:%d" % source_port
                    self.journalist_location = "http://127.0.0.1:%d" % journalist_port

                    self.source_app = source_app.create_app(config)
                    self.journalist_app = journalist_app.create_app(config)
                    self.journalist_app.config["WTF_CSRF_ENABLED"] = True

                    self.__context = self.journalist_app.app_context()
                    self.__context.push()

                    env.create_directories()
                    db.create_all()
                    self.gpg = env.init_gpg()

                    # Add our test user
                    try:
                        valid_password = "******"
                        user = Journalist(username="******",
                                          password=valid_password,
                                          is_admin=True)
                        user.otp_secret = "JHCOGO7VCER3EJ4L"
                        db.session.add(user)
                        db.session.commit()
                    except IntegrityError:
                        logging.error("Test user already added")
                        db.session.rollback()

                    # This user is required for our tests cases to login
                    self.admin_user = {
                        "name": "journalist",
                        "password": ("correct horse battery staple"
                                     " profanity oil chewy"),
                        "secret": "JHCOGO7VCER3EJ4L",
                    }

                    self.admin_user["totp"] = pyotp.TOTP(
                        self.admin_user["secret"])

                    def start_journalist_server(app):
                        app.run(port=journalist_port,
                                debug=True,
                                use_reloader=False,
                                threaded=True)

                    self.source_process = Process(
                        target=lambda: self.start_source_server(source_port))

                    self.journalist_process = Process(
                        target=lambda: start_journalist_server(
                            self.journalist_app))

                    self.source_process.start()
                    self.journalist_process.start()

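                    # poll until both apps answer HTTP before yielding to the test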
                    for tick in range(30):
                        try:
                            requests.get(self.source_location, timeout=1)
                            requests.get(self.journalist_location, timeout=1)
                        except Exception:
                            time.sleep(0.25)
                        else:
                            break
                    yield
                finally:
                    try:
                        self.source_process.terminate()
                    except Exception as e:
                        logging.error("Error stopping source app: %s", e)

                    try:
                        self.journalist_process.terminate()
                    except Exception as e:
                        logging.error("Error stopping source app: %s", e)

                    env.teardown()
                    self.__context.pop()

    def wait_for_source_key(self, source_name):
        filesystem_id = self.source_app.crypto_util.hash_codename(source_name)

        def key_available(filesystem_id):
            assert self.source_app.crypto_util.get_fingerprint(filesystem_id)

        self.wait_for(lambda: key_available(filesystem_id), timeout=60)

    def create_new_totp(self, secret):
        self.new_totp = pyotp.TOTP(secret)

    def wait_for(self, function_with_assertion, timeout=None):
        """Polling wait for an arbitrary assertion."""
        # Thanks to
        # http://chimera.labs.oreilly.com/books/1234000000754/ch20.html#_a_common_selenium_problem_race_conditions
        if timeout is None:
            timeout = self.timeout

        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                return function_with_assertion()
            except (AssertionError, WebDriverException):
                time.sleep(self.poll_frequency)
        # one more try, which will raise any errors if they are outstanding
        return function_with_assertion()

    def safe_click_by_id(self, element_id):
        """
        Clicks the element with the given ID attribute.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element cannot be found in time.

        """
        el = WebDriverWait(self.driver, self.timeout,
                           self.poll_frequency).until(
                               expected_conditions.element_to_be_clickable(
                                   (By.ID, element_id)))
        el.location_once_scrolled_into_view  # property access scrolls the element into view
        el.click()
        return el

    def safe_click_by_css_selector(self, selector):
        """
        Clicks the first element with the given CSS selector.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element cannot be found in time.

        """
        el = WebDriverWait(self.driver, self.timeout,
                           self.poll_frequency).until(
                               expected_conditions.element_to_be_clickable(
                                   (By.CSS_SELECTOR, selector)))
        el.click()
        return el

    def safe_click_all_by_css_selector(self, selector, root=None):
        """
        Clicks each element that matches the given CSS selector.

        Returns:
            els (list): The list of elements that matched the selector.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element cannot be found in time.

        """
        if root is None:
            root = self.driver
        els = self.wait_for(
            lambda: root.find_elements_by_css_selector(selector))
        for el in els:
            # wait until the matched elements are clickable, then click each
            # one rather than re-clicking the first clickable match
            WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
                expected_conditions.element_to_be_clickable(
                    (By.CSS_SELECTOR, selector)))
            el.click()
        return els

    def safe_send_keys_by_id(self, element_id, text):
        """
        Sends the given text to the element with the specified ID.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element cannot be found in time.

        """
        el = WebDriverWait(self.driver, self.timeout,
                           self.poll_frequency).until(
                               expected_conditions.element_to_be_clickable(
                                   (By.ID, element_id)))
        el.send_keys(text)
        return el

    def safe_send_keys_by_css_selector(self, selector, text):
        """
        Sends the given text to the first element with the given CSS selector.

        Returns:
            el: The element, if found.

        Raises:
            selenium.common.exceptions.TimeoutException: If the element cannot be found in time.

        """
        el = WebDriverWait(self.driver, self.timeout,
                           self.poll_frequency).until(
                               expected_conditions.element_to_be_clickable(
                                   (By.CSS_SELECTOR, selector)))
        el.send_keys(text)
        return el

    def alert_wait(self, timeout=None):
        if timeout is None:
            timeout = self.timeout * 10
        WebDriverWait(self.driver, timeout, self.poll_frequency).until(
            expected_conditions.alert_is_present(),
            "Timed out waiting for confirmation popup.")

    def alert_accept(self):
        # adapted from https://stackoverflow.com/a/34795883/837471
        def alert_is_not_present(driver):
            """Expect an alert to not be present."""
            try:
                alert = self.driver.switch_to.alert
                alert.text  # raises NoAlertPresentException if no alert
                return False
            except NoAlertPresentException:
                return True

        self.driver.switch_to.alert.accept()
        WebDriverWait(self.driver, self.timeout, self.poll_frequency).until(
            alert_is_not_present,
            "Timed out waiting for confirmation popup to disappear.")
Example #14
0
class Browser:
    
    def __init__(self, config, browser, pet, env_type, proxy_setting):

        def setup_socks5_proxy(browser, profile, proxy_setting):
            """Given valid proxy settings, configure a SOCKS5 proxy on
            Chrome (Brave) and Firefox."""
            if proxy_setting is not None:
                address = proxy_setting["address"]
                port = proxy_setting["port"]
                bypass_list = proxy_setting["bypass-list"]

                if browser == "chrome":
                    # https://sordidfellow.wordpress.com/2015/05/21/ssh-tunnel-for-chrome/
                    profile.add_argument("--proxy-server=socks5://%s:%s" % (address, port))
                    profile.add_argument("--proxy-bypass-list=%s" % bypass_list)
                    print("socks5 proxy configured on chrome")

                elif browser == "firefox":
                    # https://developer.mozilla.org/en-US/docs/Mozilla/Preferences/Mozilla_networking_preferences
                    profile.set_preference("network.proxy.type", 1)
                    profile.set_preference("network.proxy.socks", address)
                    profile.set_preference("network.proxy.socks_port", port)
                    profile.set_preference("network.proxy.socks_version", 5)
                    profile.set_preference("network.proxy.socks_remote_dns", "true")
                    profile.set_preference("network.proxy.no_proxies_on", bypass_list)
                    print("socks5 proxy configured on firefox")

        """
            If the program is run in a virtual machine, xvfbwrapper has to get installed first.        
        """
        self.env_type = env_type
        if (env_type == "vm"):
            print("xvfb")
            from xvfbwrapper import Xvfb
            width, height, depth = get_display_parameters(config)
            self.vdisplay = Xvfb(width=width, height=height, colordepth=depth)
            self.vdisplay.start()

        print("Browser:", browser, "PET:", pet)
        pet_config = PetConfig()

        if pet == "brave":
            print("brave")
            chrome_options = ChromeOptions()
            bPath, dPath = pet_config.getPetBrowserDriverPath(pet,browser,env_type)
            print(bPath, dPath)
            chromedriver = dPath
            chrome_options.binary_location = bPath
            setup_socks5_proxy("chrome", chrome_options, proxy_setting)
            os.environ["webdriver.chrome.driver"] = chromedriver
            if env_type == "vm":
                chrome_options.add_argument("--no-sandbox")
            self.driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chrome_options)
            press_enter(1)
            return

        elif pet == "tor":
            plt= platform.system().lower()
            if plt == "darwin" or plt == "windows": # https://stackoverflow.com/questions/15316304/open-tor-browser-with-selenium
                print("native tor")
                bPath, dPath = pet_config.getPetBrowserDriverPath(pet,browser,env_type)
                print(bPath, dPath)
                profile = FirefoxProfile()
                profile.set_preference("network.proxy.type", 0)
                binary = FirefoxBinary(bPath)
                self.driver = webdriver.Firefox(firefox_profile=profile, firefox_binary=binary, executable_path=dPath)
            elif plt == "linux": # https://medium.com/@manivannan_data/selenium-with-tor-browser-using-python-7b3606b8c55c
                print("vm tor")
                from tbselenium.tbdriver import TorBrowserDriver
                pref_dict = {"network.proxy.no_proxies_on": "http://10.0.2.2/, http://192.168.4.204/"}
                self.driver = TorBrowserDriver(os.environ['TBB_PATH'], pref_dict = pref_dict)
            return


        aPath, bPath, dPath, pref = pet_config.getPetBrowserDriverPath(pet,browser,env_type)
        if browser == "firefox":
            fp = FirefoxProfile()
            setup_socks5_proxy("firefox", fp, proxy_setting)
            binary = FirefoxBinary(bPath)
            if pref is not None:
                fp.set_preference(pref[0], pref[1])
            self.driver = webdriver.Firefox(firefox_profile=fp, firefox_binary=binary, executable_path=dPath)

            if aPath:
                self.driver.install_addon(aPath)

        elif browser == "chrome":
            chrome_options = webdriver.ChromeOptions()  # https://github.com/SeleniumHQ/selenium/issues/5966
            setup_socks5_proxy("chrome", chrome_options, proxy_setting)

            if aPath:
                chrome_options.add_extension(aPath)
            if pref is not None:
                chrome_options.add_experimental_option(pref[0], pref[1])
            chrome_options.binary_location = bPath
            os.environ["webdriver.chrome.driver"] = dPath

            time.sleep(1)
            self.driver = webdriver.Chrome(executable_path=dPath, chrome_options=chrome_options)
            # to escape the alert chrome display on first visit
            time.sleep(1)
            press_enter(1)
        elif browser == "safari":
            self.driver = webdriver.Safari()
        else:
            print("Unsupported Browser")
            sys.exit(0)

    def quit(self):
        try:
            self.driver.quit()
        except Exception:
            self.driver.close()     # for Tor
        if self.env_type == "vm":
            self.vdisplay.stop()


    def visit_sites(self, site_list, delay=5):
        """Visit all pages in site_list, pausing `delay` seconds after each."""
        for site in site_list:
            sys.stdout.write(".")
            sys.stdout.flush()
            try:
                self.driver.get(site)
                time.sleep(delay)
            except Exception:
                print("Unexpected error:", sys.exc_info()[0])
Example #15
0
class UntitledTestCase(unittest.TestCase):
    def setUp(self):
        print('Loading...')

        self.display = Display(visible=0, size=(800, 600))
        self.display.start()

        self.driver = TorBrowserDriver(
            '/scratch/zilton/troll/tor-browser_pt-BR/',
            tbb_logfile_path='test.log')

        # self.driver = webdriver.Chrome('chromium-browser')
        self.base_url = "https://lemonade.ctweb.inweb.org.br/#/workflows/1/"
        self.verificationErrors = []
        self.accept_next_alert = True

    def is_visible(self, locator, timeout=20):
        try:
            ui.WebDriverWait(self.driver, timeout).until(
                ec.visibility_of_element_located((By.ID, locator)))
            return True
        except TimeoutException:
            return False

    def is_not_visible(self, locator, timeout=2):
        try:
            ui.WebDriverWait(self.driver, timeout).until_not(
                ec.visibility_of_element_located((By.ID, locator)))
            return True
        except TimeoutException:
            return False

    def test_untitled_test_case(self):
        global workflow_message_error_warning, workflow_message_completed

        driver = self.driver
        # Login
        driver.get("https://lemonade.ctweb.inweb.org.br/#/login")
        driver.find_element_by_xpath("//input[@type='email']").clear()
        driver.find_element_by_xpath("//input[@type='email']").send_keys(
            lemonade_login)
        driver.find_element_by_xpath("//input[@type='password']").clear()
        driver.find_element_by_xpath("//input[@type='password']").send_keys(
            lemonade_password)
        driver.find_element_by_xpath("//button[@type='submit']").click()
        time.sleep(LOAD_TIME)

        count_progress = 1.0
        length = len(workflow_ids)
        index = 0
        count_problem = 1
        while index < length:
            workflow_id = workflow_ids[index]
            # Access the page of the workflow
            url = self.base_url + str(workflow_id)
            driver.get(url)
            # Execute the workflow
            while True:
                try:
                    time.sleep(LOAD_TIME * 0.2)
                    driver.find_element_by_id("tlb-execute-wf").click()
                    break
                except Exception:
                    pass

            while True:
                try:
                    time.sleep(LOAD_TIME * 0.2)
                    driver.find_element_by_id("mdl-execute-wf").click()
                    break
                except Exception:
                    pass
            # Monitor the status of the execution
            time.sleep(LOAD_TIME)
            status = WAITING_MSG
            current_url = driver.current_url

            # Workflow with problem
            if current_url == "https://lemonade.ctweb.inweb.org.br/#/" and count_problem < MAX_LOAD_PROBLEM:
                count_problem += 1
                continue
            elif count_problem == MAX_LOAD_PROBLEM:
                status = WARNING_MSG

            while status in (WAITING_MSG, RUNNING_MSG):
                while True:
                    try:
                        status = str(
                            driver.find_element_by_id("dtl-job-status").
                            get_attribute(name='title').upper())
                        if status:
                            break
                        time.sleep(LOAD_TIME * 0.2)
                    except Exception:
                        pass

                if (status == WAITING_MSG) or (status == RUNNING_MSG):
                    driver.refresh()
                    time.sleep(LOAD_TIME)
            # Main message after the execution ends
            message = ''
            if status != WARNING_MSG:
                while message == '':
                    try:
                        message = driver.find_element_by_id(
                            "dtl-job-status-text").text
                        break
                    except Exception:
                        pass
                    driver.refresh()
                    time.sleep(LOAD_TIME)

            workflow_name = ''
            while count_problem < MAX_LOAD_PROBLEM:
                try:
                    time.sleep(LOAD_TIME * 0.2)
                    workflow_name = driver.find_element_by_xpath(
                        "//a[contains(@href, '#/workflows/1/%s')]" %
                        workflow_id).text
                    break
                except Exception:
                    pass

            if status == WARNING_MSG:
                message += ' - The execution presented an atypical problem. ' \
                           'Please check the workflow and the correct ' \
                           'update of the messages on the Lemonade page.'

            msg_dict = {
                'workflow_name': workflow_name,
                'workflow_id': workflow_id,
                'message': message,
                'status': status,
                'url': url
            }

            if status != COMPLETED_MSG:
                workflow_message_error_warning.append(msg_dict)
            else:
                workflow_message_completed += " " + workflow_id

            UntitledTestCase.update_progress(
                job_title='Testing Lemonade workflow: ',
                progress=count_progress)
            count_progress += 1

            index += 1
            count_problem = 1

        self.driver.close()

    @staticmethod
    def update_progress(job_title, progress):
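        # `progress` arrives as a 1-based counter; normalize it by the number
        # of workflows to get a fraction in [0, 1]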
        global workflow_ids
        length = len(workflow_ids)
        progress = progress / length
        block = int(round(length * progress))
        message = "\r{0}: [{1}] {2}%".format(
            job_title, ', '.join(workflow_ids[:int(progress * length)]) + "-" *
            (length - block), round(progress * 100, 2))
        if progress >= 1:
            message += " DONE\r\n"
        sys.stdout.write(message)
        sys.stdout.flush()

    def is_element_present(self, how, what):
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            return False
        return True

    def is_alert_present(self):
        try:
            self.driver.switch_to.alert  # raises NoAlertPresentException if absent
        except NoAlertPresentException:
            return False
        return True

    def close_alert_and_get_its_text(self):
        try:
            alert = self.driver.switch_to.alert
            alert_text = alert.text
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert_text
        finally:
            self.accept_next_alert = True

    def tearDown(self):
        UntitledTestCase.sendEmail()
        self.driver.quit()
        self.display.stop()
        self.assertEqual([], self.verificationErrors)

    @staticmethod
    def sendEmail():
        global workflow_message_error_warning, workflow_message_completed

        if len(workflow_message_error_warning) > 0:
            workflow_message_completed = re.sub(r"^\s+|\s+$", "",
                                                workflow_message_completed)

            message = 'WORKFLOWS THAT PERFORMED CORRECTLY: %s' % (
                workflow_message_completed.replace(' ', ', '))
            message += '\n\nWORKFLOWS THAT DID NOT RUN SUCCESSFULLY:\n'
            for m in workflow_message_error_warning:
                if m['status'] == WARNING_MSG:
                    message += '\n- WORKFLOW: %s' % m['workflow_id']
                else:
                    message += '\n- WORKFLOW: %s' % m['workflow_name']
                message += '\n\tSTATUS: %s' % m['status']
                message += '\n\tMESSAGE: %s' % m['message']
                message += '\n\tURL: %s' % m['url']
                message += '\n___________________________\n'

            subject = "[LEMONADE] - Automatic Test for Workflows"

            email_sender.main(message_status_report=message.encode('utf-8'),
                              subject=subject)
class CrawlerTest(unittest.TestCase):
    def setUp(self):
        # clean dirs
        if isdir(TEST_DIRS):
            shutil.rmtree(TEST_DIRS)
        os.mkdir(TEST_DIRS)
        cm.CONFIG_FILE = os.path.join(cm.TEST_FILES_DIR, 'config.ini')
        self.config = ConfigParser.RawConfigParser()
        self.config.read(cm.CONFIG_FILE)

    def configure_crawler(self, crawl_type, config_section):
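        # network interface of the default IPv4 gateway, used for sniffing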
        device = netifaces.gateways()['default'][netifaces.AF_INET][1]
        tbb_dir = os.path.abspath(cm.TBB_DIR)

        # Configure controller
        torrc_config = ut.get_dict_subconfig(self.config,
                                             config_section, "torrc")
        self.controller = TorController(tbb_dir,
                                        torrc_dict=torrc_config,
                                        pollute=False)

        # Configure browser
        ffprefs = ut.get_dict_subconfig(self.config,
                                        config_section, "ffpref")
        tbb_logfile_path = os.path.join(cm.LOGS_DIR, cm.FF_LOG_FILENAME)
        socks_port = int(torrc_config['socksport'])
        self.driver = TorBrowserDriver(tbb_dir,
                                       tbb_logfile_path=tbb_logfile_path,
                                       tor_cfg=USE_RUNNING_TOR,
                                       pref_dict=ffprefs,
                                       socks_port=socks_port,
                                       canvas_allowed_hosts=[])

        # Instantiate crawler
        crawl_type = getattr(crawler_mod, "Crawler" + crawl_type)
        screenshots = True
        self.crawler = crawl_type(self.driver, self.controller,
                                  device=device, screenshots=screenshots)

        # Configure job
        self.job_config = ut.get_dict_subconfig(self.config,
                                                config_section, "job")
        # Run display
        virtual_display = ''
        self.xvfb_display = setup_virtual_display(virtual_display)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_crawl(self):
        self.configure_crawler('Base', 'captcha_test')
        job = crawler_mod.CrawlJob(self.job_config, TEST_URL_LIST)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS, 'test_crawl')
        self.run_crawl(job)
        # TODO: test for more conditions...
        self.assertGreater(len(os.listdir(cm.CRAWL_DIR)), 0)
        shutil.rmtree(cm.CRAWL_DIR)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_cloudflare_captcha_page(self):
        expected_pcaps = 2

        self.configure_crawler('WebFP', 'captcha_test')

        url = 'https://cloudflare.com/'
        job = crawler_mod.CrawlJob(self.job_config, [url])
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_cloudflare_captcha_results')
        build_crawl_dirs()
        os.chdir(cm.CRAWL_DIR)
        try:
            self.crawler.crawl(job)  # we can pass batch and instance numbers
        finally:
            self.driver.quit()
            self.controller.quit()

        capture_dirs = glob(os.path.join(cm.CRAWL_DIR, 'captcha_*'))
        self.assertEqual(expected_pcaps, len(capture_dirs))
        shutil.rmtree(cm.CRAWL_DIR)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_not_captcha_after_captcha(self):
        self.configure_crawler('WebFP', 'captcha_test')

        known_captcha_url = 'https://cloudflare.com'
        known_not_captcha_url = 'https://check.torproject.org/'
        urls = [known_captcha_url, known_not_captcha_url]
        job = crawler_mod.CrawlJob(self.job_config, urls)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_not_captcha_after_captcha')
        self.run_crawl(job)

        for _dir in os.listdir(cm.CRAWL_DIR):
            marked_captcha = _dir.startswith('captcha_')
            is_torproject_dir = 'check.torproject.org' in _dir
            if is_torproject_dir:
                self.assertTrue(not marked_captcha)
            else:
                self.assertTrue(marked_captcha)

        shutil.rmtree(cm.CRAWL_DIR)

    @pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_captcha_not_captcha_2_batches(self):
        self.configure_crawler('WebFP', 'test_captcha_not_captcha_2_batches')

        known_captcha_url = 'https://cloudflare.com'
        known_not_captcha_url = 'https://check.torproject.org/'
        urls = [known_captcha_url, known_not_captcha_url]
        job = crawler_mod.CrawlJob(self.job_config, urls)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS,
                                    'test_not_captcha_after_captcha')
        self.run_crawl(job)

        for _dir in os.listdir(cm.CRAWL_DIR):
            marked_captcha = _dir.startswith('captcha_')
            is_torproject_dir = 'check.torproject.org' in _dir
            if is_torproject_dir:
                self.assertTrue(not marked_captcha)
            else:
                self.assertTrue(marked_captcha)
        shutil.rmtree(cm.CRAWL_DIR)

    def test_website_in_capture_dir(self):
        self.configure_crawler('WebFP', 'captcha_test')

        url = 'https://cloudflare.com/'
        job = crawler_mod.CrawlJob(self.job_config, [url])
        cm.CRAWL_DIR = os.path.join(cm.TEST_DIR,
                                    'test_website_in_capture_dir')
        self.run_crawl(job)

        for _dir in os.listdir(cm.CRAWL_DIR):
            self.assertTrue('cloudflare.com' in _dir)
        shutil.rmtree(cm.CRAWL_DIR)

    def run_crawl(self, job):
        build_crawl_dirs()
        os.chdir(cm.CRAWL_DIR)
        try:
            self.crawler.crawl(job)  # we can pass batch and instance numbers
        finally:
            self.driver.quit()
            self.controller.quit()

    #@pytest.mark.skipif(bool(os.getenv('CI', False)), reason='Skip in CI')
    def test_middle(self):
        self.configure_crawler('Middle', 'captcha_test')
        job = crawler_mod.CrawlJob(self.job_config, TEST_URL_LIST)
        cm.CRAWL_DIR = os.path.join(TEST_DIRS, 'test_crawl')
        self.run_crawl(job)
        # TODO: test for more conditions...
        self.assertGreater(len(os.listdir(cm.CRAWL_DIR)), 0)
        shutil.rmtree(cm.CRAWL_DIR)