Example #1
 def test_crash_profile(self):
     manager_params, browser_params = self.get_config()
     manager_params.failure_limit = 2
     manager = TaskManager(manager_params, browser_params)
     try:
         manager.get("http://example.com")  # So we have a profile
         manager.get("example.com")  # Selenium requires scheme prefix
         manager.get("example.com")  # Selenium requires scheme prefix
         manager.get("example.com")  # Selenium requires scheme prefix
         manager.get("example.com")  # Requires two commands to shut down
     except CommandExecutionError:
         pass
     assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz"))
Example #2
    def test_profile_saved_when_launch_crashes(self):
        manager_params, browser_params = self.get_config()
        browser_params[0].proxy = True
        browser_params[0].save_content = "script"
        manager = TaskManager(manager_params, browser_params)
        manager.get("http://example.com")

        # Kill the LevelDBAggregator
        # This will cause the proxy launch to crash
        manager.ldb_status_queue.put("DIE")
        manager.browsers[0]._SPAWN_TIMEOUT = 2  # Have timeout occur quickly
        manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2  # Quick timeout
        manager.get("example.com")  # Cause a selenium crash

        # The browser will fail to launch due to the proxy crashes
        try:
            manager.get("http://example.com")
        except CommandExecutionError:
            pass
        manager.close()
        assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz"))
Example #3
def test_custom_function(default_params, xpi, server):
    """ Test `custom_function` with an inline func that collects links """
    table_name = TableName("page_links")

    manager_params, browser_params = default_params
    path = manager_params.data_directory / "crawl-data.sqlite"
    db = sqlite3.connect(path)
    cur = db.cursor()

    cur.execute(
        """CREATE TABLE IF NOT EXISTS %s (
            top_url TEXT, link TEXT,
            visit_id INTEGER, browser_id INTEGER);"""
        % table_name
    )
    cur.close()
    db.close()

    storage_provider = SQLiteStorageProvider(path)
    manager = TaskManager(manager_params, browser_params, storage_provider, None)
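    # Note: url_a, PAGE_LINKS, and CollectLinksCommand are defined elsewhere in the original test module.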
    cs = command_sequence.CommandSequence(url_a)
    cs.get(sleep=0, timeout=60)
    cs.append_command(CollectLinksCommand(table_name, "http"))
    manager.execute_command_sequence(cs)
    manager.close()
    query_result = db_utils.query_db(
        path,
        "SELECT top_url, link FROM page_links;",
        as_tuple=True,
    )
    assert PAGE_LINKS == set(query_result)
Example #4
 def test_saving(self):
     manager_params, browser_params = self.get_config()
     manager = TaskManager(manager_params, browser_params)
     manager.get("http://example.com")
     manager.close()
     assert isfile(
         join(browser_params[0].profile_archive_dir, "profile.tar.gz"))
Example #5
 def _create_task_manager(
     params: Tuple[ManagerParams, List[BrowserParams]]
 ) -> Tuple[TaskManager, Path]:
     manager_params, browser_params = params
     db_path = manager_params.data_directory / "crawl-data.sqlite"
     structured_provider = SQLiteStorageProvider(db_path)
     manager = TaskManager(
         manager_params,
         browser_params,
         structured_provider,
         None,
     )
     return manager, db_path
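A minimal usage sketch for the helper above; the test name is illustrative, and the default_params fixture is the one already used in Example #3:

def test_uses_task_manager_helper(default_params):
    manager, db_path = _create_task_manager(default_params)
    manager.get("http://example.com")
    manager.close()
    assert db_path.exists()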
Example #6
 def test_crash(self):
     manager_params, browser_params = self.get_config()
     manager_params.failure_limit = 0
     manager = TaskManager(manager_params, browser_params)
     with pytest.raises(CommandExecutionError):
         manager.get("http://example.com")  # So we have a profile
         manager.get("example.com")  # Selenium requires scheme prefix
         manager.get("example.com")  # Requires two commands to shut down
Example #7
    def test_display_shutdown(self):
        manager_params, browser_params = self.get_config()
        TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
        manager = TaskManager(manager_params, browser_params)
        port = manager.browsers[0].display_port

        sequence = CommandSequence(TEST_SITE)
        sequence.get()
        sequence.append_command(ExceptionCommand)
        manager.execute_command_sequence(sequence)
        manager.close()
        assert not os.path.exists("/tmp/.X%s-lock" % port)
Example #8
    def test_local_callbacks(self):
        manager_params, browser_params = self.get_config()
        TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
        manager = TaskManager(manager_params, browser_params)

        def callback(argument: List[int], success: bool):
            argument.extend([1, 2, 3])

        my_list = []
        sequence = CommandSequence(
            TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list)
        )
        sequence.get()

        manager.execute_command_sequence(sequence)
        manager.close()
        assert my_list == [1, 2, 3]
Example #9
 def test_seed_persistance(self):
     manager_params, browser_params = self.get_test_config(num_browsers=1)
     browser_params[0].seed_tar = "."
     command_sequences = []
     for _ in range(2):
         cs = CommandSequence(url="https://example.com", reset=True)
         cs.get()
         cs.append_command(TestConfigSetCommand("test_pref", True))
         command_sequences.append(cs)
     manager = TaskManager(manager_params, browser_params)
     for cs in command_sequences:
         manager.execute_command_sequence(cs)
     manager.close()
     query_result = db_utils.query_db(
         manager_params.database_name,
         "SELECT * FROM crawl_history;",
     )
     assert len(query_result) > 0
     for row in query_result:
          assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
Example #10
    browser_params[i].callstack_instrument = True
    # Record DNS resolution
    browser_params[i].dns_instrument = True

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params.data_directory = "~/Desktop/"
manager_params.log_directory = "~/Desktop/"

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params)

# Visits the sites
for site in sites:

    # Parallelize sites across all of the browsers configured above.
    command_sequence = CommandSequence(
        site,
        reset=True,
        callback=lambda success, val=site: print(f"CommandSequence {val} done"),
    )

    # Start by visiting the page
    command_sequence.append_command(GetCommand(url=site, sleep=10), timeout=60)
    # Have a look at custom_command.py to see how to implement your own command
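The comment above refers to custom_command.py; a minimal custom command might look like the sketch below, assuming OpenWPM's BaseCommand interface from openwpm.commands.types (the class name and log message are illustrative, not from the original):

import logging

from openwpm.commands.types import BaseCommand


class LogTitleCommand(BaseCommand):
    """Illustrative command that logs the title of the current page."""

    def __init__(self) -> None:
        self.logger = logging.getLogger("openwpm")

    def __repr__(self) -> str:
        return "LogTitleCommand"

    def execute(self, webdriver, browser_params, manager_params, extension_socket) -> None:
        # webdriver is the Selenium Firefox driver running this command sequence
        self.logger.info("Title of %s is %r", webdriver.current_url, webdriver.title)


# It would then be scheduled like any built-in command, e.g.:
# command_sequence.append_command(LogTitleCommand(), timeout=30)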
Example #11
def processSite(site):
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 1
    sites = [site]

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams

    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True

    # Commands time out by default after 60 seconds
    with TaskManager(
            manager_params,
            browser_params,
            SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
            None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites across all of the browsers configured above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )
            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3),
                                            timeout=60)
            # Have a look at custom_command.py to see how to implement your own command
            command_sequence.append_command(LinkCountingCommand())
            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
    return None
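A hypothetical driver for processSite, calling it once per site; the site list here is illustrative:

if __name__ == "__main__":
    for site_url in ["http://example.com", "http://example.org"]:
        processSite(site_url)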
Example #12
structured = GcsStructuredProvider(
    project=GCP_PROJECT,
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY,
    token=AUTH_TOKEN,
)
unstructured = GcsUnstructuredProvider(
    project=GCP_PROJECT,
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY + "/data",
    token=AUTH_TOKEN,
)
# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(
    manager_params,
    browser_params,
    structured,
    unstructured,
    logger_kwargs=LOGGER_SETTINGS,
)

# At this point, Sentry should be initiated
if SENTRY_DSN:
    # Add crawler.py-specific context
    with sentry_sdk.configure_scope() as scope:
        # tags generate breakdown charts and search filters
        scope.set_tag("CRAWL_DIRECTORY", CRAWL_DIRECTORY)
        scope.set_tag("GCS_BUCKET", GCS_BUCKET)
        scope.set_tag("DISPLAY_MODE", DISPLAY_MODE)
        scope.set_tag("HTTP_INSTRUMENT", HTTP_INSTRUMENT)
        scope.set_tag("COOKIE_INSTRUMENT", COOKIE_INSTRUMENT)
        scope.set_tag("NAVIGATION_INSTRUMENT", NAVIGATION_INSTRUMENT)
Example #13
    # Record DNS resolution
    browser_params[i].dns_instrument = True

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params.data_directory = Path("./datadir/")
manager_params.log_directory = Path("./datadir/")

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Commands time out by default after 60 seconds
with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
        None,
) as manager:
    # Visits the sites
    for index, site in enumerate(sites):

        def callback(success: bool, val: str = site) -> None:
            print(
                f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
            )

        # Parallelize sites across all of the browsers configured above.
        command_sequence = CommandSequence(
            site,
            site_rank=index,
            callback=callback,
        )
Example #14
 def test_profile_error(self):
     manager_params, browser_params = self.get_config()
     browser_params[0].seed_tar = "/tmp/NOTREAL"
     with pytest.raises(ProfileLoadError):
         TaskManager(manager_params, browser_params)  # noqa
Example #15
def crawl(sites, db_filename):
    '''
    sites is the list of sites that we wish to crawl
    db_filename is the file name of the output database
    '''

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    NUM_BROWSERS = 12

    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True

        browser_params[i].bot_mitigation = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # Commands time out by default after 60 seconds
    with TaskManager(
            manager_params,
            browser_params,
            SQLiteStorageProvider(
                Path("./datadir/{}.sqlite".format(db_filename))),
            None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites across all of the browsers configured above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                reset=True,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3),
                                            timeout=60)

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
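Once the crawl completes, the output database can be inspected directly. A quick sketch, assuming the standard OpenWPM SQLite schema (an http_requests table with a url column) and that crawl() was called with db_filename="crawl-data":

import sqlite3

con = sqlite3.connect("./datadir/crawl-data.sqlite")
for (url,) in con.execute("SELECT url FROM http_requests LIMIT 10;"):
    print(url)
con.close()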