예제 #1
0
    async def __add_page_settings(self, page: Page) -> None:
        """Apply this instance's configured settings to ``page``.

        The default navigation timeout is set directly. Every other setting
        (blocked URLs, cache, JavaScript injected on each new document and
        request interception) is collected as an awaitable and all of them are
        awaited together at the end.

        Args:
            page: The page to configure.
        """
        # Change the default maximum navigation timeout.
        if self.default_nav_timeout:
            page.setDefaultNavigationTimeout(self.default_nav_timeout)
        # Awaitables collected here are awaited together by the final gather().
        tasks = []
        # Blocks URLs from loading.
        if self.blocked_urls:
            self.logger.info(f"Adding {len(self.blocked_urls)} blocked urls")
            tasks.append(
                page._client.send('Network.setBlockedURLs', {
                    'urls': self.blocked_urls,
                }))
        # Disable cache for each request.
        if self.disable_cache:
            self.logger.info("Setting cache disabled.")
            tasks.append(page.setCacheEnabled(False))
        # Add JavaScript function(s) that will be invoked whenever the page is navigated.
        if self.js_injection_scripts:
            self.logger.info(
                f"Adding {len(self.js_injection_scripts)} JavaScript injection scripts"
            )
            tasks.extend(
                page.evaluateOnNewDocument(script)
                for script in self.js_injection_scripts)
        # Bundled scripts live in directories next to this module.
        script_root = Path(__file__).parent
        # Add JavaScript functions to prevent automation detection.
        for js_file in script_root.joinpath('automation_detection').glob("*.js"):
            self.logger.info(
                f"(page {page}) Adding automation detection prevention script: {js_file.name}"
            )
            tasks.append(page.evaluateOnNewDocument(js_file.read_text()))
        # Add JavaScript functions to prevent detection of headless mode.
        if self.headless:
            for js_file in script_root.joinpath('headless_detection').glob("*.js"):
                self.logger.info(
                    f"(page {page}) Adding headless detection prevention script: {js_file.name}"
                )
                tasks.append(page.evaluateOnNewDocument(js_file.read_text()))
        # Intercept all requests and only allow requests for types not in self.request_abort_types.
        if self.request_abort_types:
            self.logger.info(
                f"Setting request interception for {self.request_abort_types}")
            tasks.append(page.setRequestInterception(True))

            async def block_type(request):
                """Abort requests of a blocked resource type; continue all others."""
                if request.resourceType in self.request_abort_types:
                    await request.abort()
                else:
                    await request.continue_()

            # The event loop only keeps weak references to tasks, so keep a
            # strong reference to each handler task until it has finished.
            handler_tasks = set()

            def on_request(request):
                """Schedule block_type() for an intercepted request."""
                task = asyncio.create_task(block_type(request))
                handler_tasks.add(task)
                task.add_done_callback(handler_tasks.discard)
                return task

            page.on('request', on_request)
        await asyncio.gather(*tasks)
예제 #2
0
    async def _add_page_settings(self, page: Page) -> None:
        """Add custom settings to a page.

        Settings are read from the launch options stored for the page's
        browser in ``self.browsers``. The default navigation timeout is set
        directly; all other settings are collected as awaitables and awaited
        together at the end.

        Request interception is enabled when either 'requestAbortTypes' or
        'blockRedirects' is set, so that 'blockRedirects' also takes effect
        when no abort types are configured.

        Args:
            page: The page to configure.
        """
        # launch options for this page.
        launch_options = self.browsers[page.browser]['launch_options']
        # set the default maximum navigation time.
        if 'defaultNavigationTimeout' in launch_options:
            page.setDefaultNavigationTimeout(
                launch_options['defaultNavigationTimeout'])
        # awaitables collected here are awaited together by the final gather().
        tasks = [self.set_stealth(page)]
        # blocks URLs from loading.
        if 'blockedURLs' in launch_options:
            tasks.append(
                self.set_blocked_urls(page, launch_options['blockedURLs']))
        # enable/disable cache for each request, as given in the launch options.
        if 'setCacheEnabled' in launch_options:
            tasks.append(
                page.setCacheEnabled(launch_options['setCacheEnabled']))
        # add JavaScript function(s) that will be invoked whenever the page is navigated.
        tasks.extend(
            page.evaluateOnNewDocument(script)
            for script in launch_options.get('evaluateOnNewDocument', []))
        # intercept all requests and only allow requests that match no abort condition.
        request_abort_types = launch_options.get('requestAbortTypes')
        # redirect blocking is done in the request handler too, so it must
        # also turn interception on when no abort types are configured.
        if request_abort_types or launch_options.get('blockRedirects', False):
            # enable request interception.
            tasks.append(page.setRequestInterception(True))

            async def block_type(request: Request):
                """Abort requests matching an abort condition; continue all others."""
                # abort types may be unset when only 'blockRedirects' is enabled.
                if request_abort_types and request.resourceType in request_abort_types:
                    await request.abort()
                elif (launch_options.get('blockRedirects', False)
                      and request.isNavigationRequest()
                      and len(request.redirectChain)):
                    # navigation request that is part of a redirect chain.
                    await request.abort()
                else:
                    await request.continue_()

            # the event loop only keeps weak references to tasks, so keep a
            # strong reference to each handler task until it has finished.
            handler_tasks = set()

            def on_request(request):
                """Schedule block_type() for an intercepted request."""
                task = asyncio.create_task(block_type(request))
                handler_tasks.add(task)
                task.add_done_callback(handler_tasks.discard)
                return task

            page.on('request', on_request)
        await asyncio.gather(*tasks)
예제 #3
0
파일: spider.py 프로젝트: bminossi/distbot
    async def _add_page_settings(self, page: Page) -> None:
        """Add custom settings to a page.

        Registers the bundled ``stealth.min.js`` script to run on every new
        document, then applies the launch options stored for the page's
        browser in ``self.browsers``. The default navigation timeout is set
        directly; all other settings are collected as awaitables and awaited
        together at the end.

        Args:
            page: The page to configure.
        """
        # add JavaScript functions to prevent automation detection.
        stealth_js = Path(__file__).parent.joinpath('stealth.min.js').read_text()
        # awaitables collected here are awaited together by the final gather().
        tasks = [page.evaluateOnNewDocument(f"() => {{{stealth_js}}}")]
        # launch options for this page.
        launch_options = self.browsers[page.browser]['launch_options']
        # set the default maximum navigation time.
        if 'defaultNavigationTimeout' in launch_options:
            page.setDefaultNavigationTimeout(
                launch_options['defaultNavigationTimeout'])
        # blocks URLs from loading. Collected with the other awaitables rather
        # than awaited inline, so a failure here cannot leave the coroutines
        # already in `tasks` un-awaited.
        if 'blockedURLs' in launch_options:
            tasks.append(
                page._client.send('Network.setBlockedURLs',
                                  {'urls': launch_options['blockedURLs']}))
        # enable/disable cache for each request, as given in the launch options.
        if 'setCacheEnabled' in launch_options:
            tasks.append(
                page.setCacheEnabled(launch_options['setCacheEnabled']))
        # add JavaScript function(s) that will be invoked whenever the page is navigated.
        tasks.extend(
            page.evaluateOnNewDocument(script)
            for script in launch_options.get('evaluateOnNewDocument', []))
        # intercept all requests and only allow requests for types not in request_abort_types.
        request_abort_types = launch_options.get('requestAbortTypes')
        if request_abort_types:
            tasks.append(page.setRequestInterception(True))

            async def block_type(request):
                """Abort requests of a blocked resource type; continue all others."""
                if request.resourceType in request_abort_types:
                    await request.abort()
                else:
                    await request.continue_()

            # the event loop only keeps weak references to tasks, so keep a
            # strong reference to each handler task until it has finished.
            handler_tasks = set()

            def on_request(request):
                """Schedule block_type() for an intercepted request."""
                task = asyncio.create_task(block_type(request))
                handler_tasks.add(task)
                task.add_done_callback(handler_tasks.discard)
                return task

            page.on('request', on_request)
        await asyncio.gather(*tasks)