Exemplo n.º 1
0
    def test_tor_browser_with_no_option(self, config, tor_proxy):
        """
        Sets up a TorBrowser over the Tor proxy without passing any ``options``.

        There is no explicit assertion: the test passes as long as ``setup()``
        does not raise.
        """
        browser = TorBrowser(
            config=config,
            url=self.target_url,
            proxy=tor_proxy,
            use_proxy_type="tor",
            explicit_wait_duration=0,
        )

        # Only the setup step is exercised here, nothing is connected or fetched
        browser.setup()
Exemplo n.º 2
0
    def test_tor_browser_with_wrong_security_option(self, config, tor_proxy):
        """
        Sets up a TorBrowser whose ``tbb_security_level`` option is ``None``.

        There is no explicit assertion: the test passes as long as ``setup()``
        does not raise with this option value.
        """
        invalid_options = {"tbb_security_level": None}

        browser = TorBrowser(
            config=config,
            url=self.target_url,
            proxy=tor_proxy,
            use_proxy_type="tor",
            explicit_wait_duration=0,
            options=invalid_options,
        )

        # Only the setup step is exercised here, nothing is connected or fetched
        browser.setup()
Exemplo n.º 3
0
    def test_tor_browser_with_http_proxy(self, config, http_proxy):
        """
        Fetches the target URL with TorBrowser through an HTTP proxy and checks
        that the first element of the ``http_proxy`` fixture (presumably the
        proxy host, TODO confirm against the fixture) appears in the fetched
        page source.
        """
        tor_browser = TorBrowser(
            config=config,
            url=self.target_url,
            proxy=http_proxy,
            use_proxy_type="http",
            explicit_wait_duration=0,
        )

        tor_browser.setup()

        # Always call close(), even when connecting, fetching or the assertion
        # fails, so a failing test still releases whatever close() cleans up
        try:
            tor_browser.connect()
            tor_browser.fetch()

            assert http_proxy[0] in tor_browser.page_source
        finally:
            tor_browser.close()
Exemplo n.º 4
0
    def test_tor_browser_security_level_safer(self, config, tor_proxy):
        """
        Fetches the target URL with TorBrowser over the Tor proxy using the
        "safer" ``tbb_security_level`` and checks that the fetched page source
        contains the "configured to use Tor" congratulation message.
        """
        tor_browser = TorBrowser(
            config=config,
            url=self.target_url,
            proxy=tor_proxy,
            use_proxy_type="tor",
            explicit_wait_duration=0,
            options={"tbb_security_level": "safer"},
        )

        tor_browser.setup()

        # Always call close(), even when connecting, fetching or the assertion
        # fails, so a failing test still releases whatever close() cleans up
        try:
            tor_browser.connect()
            tor_browser.fetch()

            assert (
                "Congratulations. This browser is configured to use Tor."
                in tor_browser.page_source
            )
        finally:
            tor_browser.close()
Exemplo n.º 5
0
    def test_tor_browser_wrong_profile_location(self, config, tor_proxy):
        """
        Checks that ``setup()`` raises ``TorBrowserProfileLocationError`` when
        the configured Tor Browser profile location is replaced with a bogus
        value.
        """
        # Work on a copy so the config fixture itself is left untouched
        broken_config = deep_copy(config)
        profile_location_key = "docker_tor_browser_container_profile_location"
        broken_config[profile_location_key] = "obviously_wrong_directory"

        browser = TorBrowser(
            config=broken_config,
            url=self.target_url,
            proxy=tor_proxy,
            use_proxy_type="tor",
            explicit_wait_duration=0,
            options={"tbb_security_level": "standard"},
        )

        # Setting up with the bogus profile location has to fail with this error
        with pytest.raises(TorBrowserProfileLocationError):
            browser.setup()
Exemplo n.º 6
0
    def process_next_job(self) -> None:
        """
        Processes the next available job in the job queue. Claims the job, tries
        fetching the URL specified in the job with the specified fetcher. If
        successful, inserts the results into the FetchCompleted table. Otherwise,
        inserts the results into the FetchFailed table. Finally, removes the
        claimed job from the queue.

        Returns without doing anything when there is no job to process.

        :raises FetcherNotFound: If requested fetcher is not available. Note that
            it is raised inside the try block below, so (assuming it derives from
            Exception) it is recorded as a failed fetch instead of reaching the
            caller
        """
        # pylint: disable=R0912,R0915
        # Get claimed jobs by this worker
        db_job = self.__db_session.query(FetchQueue).filter(
            FetchQueue.claimed_by == self.__worker_id
        )

        # Claim a new job if not already claimed
        if db_job.count() == 0:
            # TODO: Yes, the following is a bad practice, please use an ORM statement instead
            # The table name comes from the model class and the worker id is a
            # bound parameter, so no external input is interpolated into the SQL
            table = FetchQueue.__tablename__.lower()
            query = f"UPDATE {table} SET claimed_by = :worker_id WHERE id = (SELECT min(id) FROM {table} WHERE claimed_by IS NULL)"
            params = {"worker_id": self.__worker_id}
            self.__db_session.execute(text(query), params)
            self.__db_session.commit()

        # Get the claimed job
        job = db_job.first()

        # Don't do anything if there is no job in the queue
        if job is None:
            return

        # Options passed to the fetcher and stored together with the result
        options_dict = {}

        # The fetcher created for *this* job. Tracking it locally (instead of
        # checking whether the instance attribute exists) makes sure a fetcher
        # left over from a previous job is never health-checked or closed again
        # when the current job fails before its own fetcher gets created
        fetcher = None

        try:
            # Create the options based on the ones described within the job.
            # Copy them so that adding keys below doesn't modify the dictionary
            # held by the queued job itself
            if job.options is not None:
                options_dict = dict(job.options)

            # Create a new circuit if we will be using Tor
            proxy = None
            if job.ref_fetcher.uses_proxy_type == "tor":
                self.__tor_launcher.create_new_circuit_to(job.ref_relay.fingerprint)
                proxy = (
                    self.__tor_launcher.ip_address,
                    self.__tor_launcher.socks_port,
                )

            # Check if the proxy type is http, if so: add host and port into the proxy tuple
            elif job.ref_fetcher.uses_proxy_type == "http":
                proxy = (job.ref_proxy.host, job.ref_proxy.port)

            # Find the fetcher class matching the method requested by the job.
            # The order of the candidates is the order they are checked in
            requested_method = job.ref_fetcher.method
            fetcher_class = next(
                (
                    candidate
                    for candidate in (
                        TorBrowser,
                        FirefoxBrowser,
                        ChromeBrowser,
                        OperaBrowser,
                    )
                    if requested_method == candidate.method_name_in_db
                ),
                None,
            )

            if fetcher_class is None:
                raise FetcherNotFound

            # Only Tor Browser gets the security level passed in via the options
            if fetcher_class is TorBrowser:
                options_dict.update({"tbb_security_level": job.tbb_security_level})

            # Fetch it using the fetcher
            fetcher = fetcher_class(
                config=self.__config,
                url=job.url,
                proxy=proxy,
                options=options_dict,
                use_proxy_type=job.ref_fetcher.uses_proxy_type,
            )

            # Keep the instance attribute up to date as well
            self.__fetcher = fetcher

            fetcher.setup()
            fetcher.connect()
            fetcher.fetch()

        # pylint: disable=W0703
        except Exception:
            error = get_traceback_information()

            # If failed, put into the failed table
            failed = FetchFailed(
                url=job.url,
                options=options_dict,
                tbb_security_level=job.tbb_security_level,
                captcha_monitor_version=self.__config["version"],
                fail_reason=str(error),
                fetcher_id=job.fetcher_id,
                domain_id=job.domain_id,
                relay_id=job.relay_id,
                proxy_id=job.proxy_id,
            )
            self.__db_session.add(failed)
            self.__logger.debug(
                "Worker %s wasn't able to fetch %s with %s: %s",
                self.__worker_id,
                job.url,
                job.fetcher_id,
                str(error),
            )

            # If fetcher was initialized correctly
            if fetcher is not None:
                # Check if container is healthy
                self.__logger.debug("Checking if the container is healthy")
                ContainerManager(
                    fetcher.container_host
                ).restart_browser_container_if_unhealthy()

        else:
            # If successful, put into the completed table
            completed = FetchCompleted(
                url=job.url,
                options=options_dict,
                tbb_security_level=job.tbb_security_level,
                captcha_monitor_version=self.__config["version"],
                html_data=fetcher.page_source,
                http_requests=fetcher.page_har,
                fetcher_id=job.fetcher_id,
                domain_id=job.domain_id,
                relay_id=job.relay_id,
                proxy_id=job.proxy_id,
            )
            self.__db_session.add(completed)
            self.__logger.debug(
                "Worker %s successfully fetched %s with %s",
                self.__worker_id,
                job.url,
                job.fetcher_id,
            )

        finally:
            # Close the fetcher created for this job, if there is one
            if fetcher is not None:
                fetcher.close()

            # Reset the changes
            self.__tor_launcher.reset_configuration()

            # Delete job from the job queue
            self.__db_session.delete(job)

            # Commit changes to the database
            self.__db_session.commit()