Exemplo n.º 1
0
async def parse_wordstat_page(page: Page) -> Tuple[list, list]:
    """Extract the two Wordstat result tables from an already loaded page.

    Waits until the "including phrases" block, the "associations" block and
    the info wrapper inside the first block are visible, then evaluates a
    small script in the browser that turns each table row into a
    ``[query, count]`` pair.

    Args:
        page: Page on which the Wordstat results are rendered. The in-page
            script calls ``$``, so jQuery is presumably available on that
            page -- TODO confirm.

    Returns:
        ``(phrases, assocs)``: the rows of the including-phrases table and of
        the associations table, in that order.
    """
    selectors = (
        'div.b-word-statistics__including-phrases',
        'div.b-word-statistics__phrases-associations',
        # Info wrapper: holds a header of the form
        # 'What was searched with the word "<query>" - 548 impressions per month'.
        'div.b-word-statistics__including-phrases .b-word-statistics__info-wrapper',
    )
    phrases_div, assoc_div, _info_block = await asyncio.gather(
        *[page.waitForSelector(selector, {'visible': True})
          for selector in selectors])

    # Runs in the browser: for every row, read the phrase link text and the
    # number cell (with non-breaking spaces stripped from the number).
    table_rows_js = '''
        rows => rows.map(row => {
            const j_row = $(row);
            const query = j_row.find('a.b-phrase-link__link').text();
            const count = j_row.find('td.b-word-statistics__td-number').text().replace(/\xa0/gi, '');
            return [query, count];
        })
    '''

    # 'tr + tr' matches every row that follows another row, i.e. it skips the
    # first row of each table.
    row_selector = 'tr + tr'
    phrases, assocs = await asyncio.gather(
        phrases_div.JJeval(row_selector, table_rows_js),
        assoc_div.JJeval(row_selector, table_rows_js),
    )

    print('phrases_div - {}'.format(phrases_div))
    print(phrases, assocs)
    print('phrases - {}'.format(phrases))
    print('assocs - {}'.format(assocs))

    # NOTE: the info wrapper is only awaited above; its header text (exact
    # query and its impression count) is not parsed or returned.
    return phrases, assocs
Exemplo n.º 2
0
    async def _download_request_with_page(self, request: Request,
                                          spider: Spider,
                                          page: Page) -> Response:
        """Load ``request.url`` in ``page`` and build a response from it.

        After navigation, every ``PageCoroutine`` found under
        ``request.meta["pyppeteer_page_coroutines"]`` (a sequence, or the
        values of a dict) is executed against the page. The rendered page
        content becomes the response body.

        If the callback has a parameter annotated with
        ``pyppeteer.page.Page``, the page is handed to it through
        ``request.cb_kwargs`` and left open; otherwise the page is closed here.

        Args:
            request: Request being downloaded; its ``meta`` and ``cb_kwargs``
                are updated in place.
            spider: Spider whose ``parse`` is used when the request has no
                callback.
            page: Browser page used for the navigation.

        Returns:
            A response of the class chosen by ``responsetypes.from_args``,
            flagged with ``"pyppeteer"``.
        """
        started_at = time()
        response = await page.goto(request.url)

        coroutines = request.meta.get("pyppeteer_page_coroutines") or ()
        if isinstance(coroutines, dict):
            coroutines = coroutines.values()
        for coroutine in coroutines:
            # Anything that is not a PageCoroutine is silently skipped.
            if not isinstance(coroutine, PageCoroutine):
                continue
            bound_method = getattr(page, coroutine.method)

            # Apply the handler-wide timeout only when the coroutine did not
            # bring its own (missing or falsy "timeout").
            default_timeout = self.page_coroutine_timeout
            if default_timeout is not None and not coroutine.kwargs.get(
                    "timeout", None):
                coroutine.kwargs["timeout"] = default_timeout

            if isinstance(coroutine, NavigationPageCoroutine):
                # Wait for the navigation concurrently with the action that
                # triggers it; no result is stored for this kind.
                navigation = page.waitForNavigation()
                await asyncio.gather(
                    navigation,
                    bound_method(*coroutine.args, **coroutine.kwargs))
                continue
            coroutine.result = await bound_method(*coroutine.args,
                                                  **coroutine.kwargs)

        body = (await page.content()).encode("utf8")
        request.meta["download_latency"] = time() - started_at

        callback = request.callback or spider.parse
        type_hints = getattr(callback, "__annotations__", {})
        page_injected = False
        for param_name, hint in type_hints.items():
            if hint is pyppeteer.page.Page:
                # Only the first matching parameter receives the page.
                request.cb_kwargs[param_name] = page
                self.stats.inc_value("pyppeteer/page_count/injected_callback")
                page_injected = True
                break
        if not page_injected and not page.isClosed():
            await page.close()
            self.stats.inc_value("pyppeteer/page_count/closed")

        headers = Headers(response.headers)
        # The body is the rendered page content, so any Content-Encoding
        # header from the original response is dropped.
        headers.pop("Content-Encoding", None)
        response_class = responsetypes.from_args(headers=headers,
                                                 url=page.url,
                                                 body=body)
        return response_class(
            url=page.url,
            status=response.status,
            headers=headers,
            body=body,
            request=request,
            flags=["pyppeteer"],
        )
Exemplo n.º 3
0
    async def __add_page_settings(self, page: Page) -> None:
        """Apply this instance's configured settings to ``page``.

        Depending on which attributes are set, this changes the default
        navigation timeout, blocks URLs, disables the cache, registers
        JavaScript to evaluate on every new document (user scripts plus the
        ``*.js`` files in the ``automation_detection`` and, when headless,
        ``headless_detection`` directories next to this module), and enables
        request interception that aborts requests whose resource type is in
        ``self.request_abort_types``.

        Args:
            page: Page to configure.
        """
        nav_timeout = self.default_nav_timeout
        if nav_timeout:
            page.setDefaultNavigationTimeout(nav_timeout)

        # Awaitables are collected here and awaited together at the end.
        pending = []

        if self.blocked_urls:
            self.logger.info(f"Adding {len(self.blocked_urls)} blocked urls")
            block_params = {
                'urls': self.blocked_urls,
            }
            pending.append(
                page._client.send('Network.setBlockedURLs', block_params))

        if self.disable_cache:
            self.logger.info("Setting cache disabled.")
            pending.append(page.setCacheEnabled(False))

        if self.js_injection_scripts:
            self.logger.info(
                f"Adding {len(self.js_injection_scripts)} JavaScript injection scripts"
            )
            pending.extend(
                page.evaluateOnNewDocument(source)
                for source in self.js_injection_scripts)

        module_dir = Path(__file__).parent

        # Scripts meant to prevent automation detection are always added.
        for script_path in module_dir.joinpath('automation_detection').glob(
                "*.js"):
            self.logger.info(
                f"(page {page}) Adding automation detection prevention script: {script_path.name}"
            )
            pending.append(
                page.evaluateOnNewDocument(script_path.read_text()))

        # Scripts meant to prevent headless detection only apply in headless
        # mode.
        if self.headless:
            for script_path in module_dir.joinpath('headless_detection').glob(
                    "*.js"):
                self.logger.info(
                    f"(page {page}) Adding headless detection prevention script: {script_path.name}"
                )
                pending.append(
                    page.evaluateOnNewDocument(script_path.read_text()))

        if self.request_abort_types:
            self.logger.info(
                f"Setting request interception for {self.request_abort_types}")
            pending.append(page.setRequestInterception(True))

            async def abort_or_continue(request):
                # Abort unwanted resource types, let everything else through.
                blocked = request.resourceType in self.request_abort_types
                decide = request.abort if blocked else request.continue_
                await decide()

            def schedule(request):
                # The event callback is synchronous, so the decision is
                # scheduled as a task.
                return asyncio.create_task(abort_or_continue(request))

            page.on('request', schedule)

        await asyncio.gather(*pending)
Exemplo n.º 4
0
 async def _set_cookies(self, page: Page, cookies: Union[List[Dict[str, str]], Dict[str, str]]) -> None:
     """Set one cookie or a collection of cookies on ``page``.

     Args:
         page: Page that receives the cookies.
         cookies: Either a single cookie dict, or a list/tuple/set of cookie
             dicts. Any other type is ignored.
     """
     # A single cookie is set directly.
     if isinstance(cookies, dict):
         await page.setCookie(cookies)
         return
     # A collection is set with one call per cookie, run concurrently.
     if isinstance(cookies, (list, tuple, set)):
         pending = [page.setCookie(entry) for entry in cookies]
         await asyncio.gather(*pending)
Exemplo n.º 5
0
 async def _close_page(self, page: Page) -> None:
     """Forget ``page`` and close it, waiting at most two seconds.

     The page is removed from the idle queue (if it is currently in it) and
     its ``self.pages[page]`` entry is deleted before the close is attempted.
     A close that does not finish within the timeout is logged as a warning.

     Args:
         page: Page to remove and close.
     """
     logger.info(f"Removing page: {page}")
     # Reaches into the queue's underlying container because the queue API
     # used here offers no removal of an arbitrary item.
     idle_items = self.idle_page_q._queue
     if page in idle_items:
         idle_items.remove(page)
     del self.pages[page]
     try:
         closing = page.close()
         await asyncio.wait_for(closing, timeout=2)
     except asyncio.TimeoutError:
         logger.warning(f"Page {page} could not be properly closed.")
Exemplo n.º 6
0
async def __try_load_contact_by_number(page: Page, target: str) -> bool:
    """Try to open a chat with ``target`` interpreted as a phone number.

    Args:
        page: Page used to load the contact URL.
        target: Candidate phone number. It must convert to a non-zero
            ``int``; otherwise nothing is loaded.

    Returns:
        ``True`` when the contact page was loaded and the "invalid number"
        button is not present. ``False`` when the number is reported as
        invalid, when ``target`` converts to zero, or when any exception
        occurs (including ``target`` not being numeric); exceptions are
        logged, not raised.
    """
    try:
        if int(target):
            __logger.debug("Loading contact by number.")

            # Dialogs raised while loading are accepted in the background.
            page.on(
                'dialog',
                lambda dialog: asyncio.ensure_future(__accept_dialog(dialog))
            )
            await load_website(page, f"{websites['wpp_unknown']}{target}")
            # Give the page time to settle. This must be an awaited sleep:
            # a blocking time.sleep() would freeze the event loop, so the
            # dialog handler scheduled above could not run during the wait.
            await asyncio.sleep(2)
            if (await page.evaluate(f'document.querySelector("{whatsapp_selectors_dict["invalid_number_ok_button"]}") != null')):
                # Dismiss the "invalid number" notice before reporting failure.
                await page.click(whatsapp_selectors_dict["invalid_number_ok_button"])
                __logger.debug(f"Invalid number: {target}")
                print(f"Invalid Number: {target}")
                return False
            return True
    except Exception as e:
        __logger.error(f"Error loading contact by number: {str(e)}")
        return False
    # Reached only when int(target) is zero.
    return False
Exemplo n.º 7
0
 async def load_cookies(self, page: Page):
     """Load cookies from ``self.cookie_path`` into ``page``.

     The file is read as JSON and iterated as a sequence of cookie dicts.
     Cookies whose ``"name"`` is listed in ``self.IGNORE_COOKIE_NAMES`` are
     skipped; the rest are set concurrently. If the file does not exist,
     only a log message is written.

     Args:
         page: Page that receives the cookies.
     """
     username = self.password_manager.username
     if not os.path.exists(self.cookie_path):
         LOG.info(
             f"Cookies for {username} not yet existing."
         )
         return

     with open(self.cookie_path, mode="r") as cookie_file:
         stored_cookies = json.load(cookie_file)

     pending = []
     for cookie in stored_cookies:
         if cookie["name"] in self.IGNORE_COOKIE_NAMES:
             continue
         pending.append(asyncio.create_task(page.setCookie(cookie)))

     # asyncio.wait() rejects an empty collection, hence the guard.
     if pending:
         await asyncio.wait(pending, return_when=asyncio.ALL_COMPLETED)
     LOG.info(f"Cookies for {username} loaded.")
Exemplo n.º 8
0
def toService(page: Page):
    """Navigate ``page`` to the service view and pick the ``zys-pay`` entry.

    Goes through ``toCe`` first, then performs a fixed sequence of clicks and
    one text entry inside the page's second frame (``page.frames[1]``),
    passing each action to ``then``.

    Args:
        page: Page to drive.
    """
    service_link = "#pageContent > div > div.row > div:nth-child(2) > a"
    second_tab = (
        "#pageContent > div > div.row > div > div > div.tabbable.tabbable-tabdrop > ul > li:nth-child(2) > a"
    )
    dropdown_toggle = (
        "#allApplyInfo > div > div:nth-child(1) > label:nth-child(1) > div > span > span.selection > span"
    )
    dropdown_search = (
        "body > span > span > span.select2-search.select2-search--dropdown > input"
    )
    # NOTE(review): `:contains(...)` is a jQuery extension rather than
    # standard CSS -- confirm that the click resolves this selector.
    dropdown_result = (
        "#select2-select2-button-addons-single-input-group-sm-results > li:contains('zys-pay')"
    )

    toCe(page)
    # page.frames[1] is looked up again for every step rather than cached.
    then(page.frames[1].click(service_link))
    then(page.waitFor(3000))
    then(page.frames[1].click(second_tab))
    then(page.frames[1].click(dropdown_toggle))
    then(page.frames[1].type(dropdown_search, "zys-pay"))
    then(page.frames[1].click(dropdown_result))
Exemplo n.º 9
0
    async def _add_page_settings(self, page: Page) -> None:
        """Configure ``page`` from the launch options of its browser.

        Always registers the contents of ``stealth.min.js`` (located next to
        this module), wrapped in an arrow function, to be evaluated on every
        new document. The remaining settings are driven by the keys present
        in ``self.browsers[page.browser]['launch_options']``:
        ``defaultNavigationTimeout``, ``blockedURLs``, ``setCacheEnabled``,
        ``evaluateOnNewDocument`` and ``requestAbortTypes``.

        Args:
            page: Page to configure.
        """
        # JavaScript meant to prevent automation detection.
        stealth_source = Path(__file__).parent.joinpath(
            'stealth.min.js').read_text()
        pending = [
            page.evaluateOnNewDocument(f"() => {{{stealth_source}}}")
        ]

        options = self.browsers[page.browser]['launch_options']

        # Default maximum navigation time.
        if 'defaultNavigationTimeout' in options:
            nav_timeout = options['defaultNavigationTimeout']
            page.setDefaultNavigationTimeout(nav_timeout)

        # URL blocking is awaited right away rather than batched.
        if 'blockedURLs' in options:
            blocked = {'urls': options['blockedURLs']}
            await page._client.send('Network.setBlockedURLs', blocked)

        # Cache switch for every request.
        if 'setCacheEnabled' in options:
            pending.append(page.setCacheEnabled(options['setCacheEnabled']))

        # Extra scripts to run on every new document.
        pending.extend(
            page.evaluateOnNewDocument(source)
            for source in options.get('evaluateOnNewDocument', []))

        # Intercept requests and abort those with an unwanted resource type.
        abort_types = options.get('requestAbortTypes')
        if abort_types:
            pending.append(page.setRequestInterception(True))

            async def abort_or_continue(request):
                blocked_type = request.resourceType in abort_types
                decide = request.abort if blocked_type else request.continue_
                await decide()

            def schedule(request):
                # The event callback is synchronous, so the decision is
                # scheduled as a task.
                return asyncio.create_task(abort_or_continue(request))

            page.on('request', schedule)

        await asyncio.gather(*pending)
Exemplo n.º 10
0
    async def _add_page_settings(self, page: Page) -> None:
        """Configure ``page`` from the launch options of its browser.

        Always applies ``self.set_stealth(page)``. The remaining settings are
        driven by the keys present in
        ``self.browsers[page.browser]['launch_options']``:
        ``defaultNavigationTimeout``, ``blockedURLs``, ``setCacheEnabled``,
        ``evaluateOnNewDocument``, ``requestAbortTypes`` and
        ``blockRedirects``. ``blockRedirects`` is only consulted when
        ``requestAbortTypes`` is non-empty, because that is what enables
        request interception.

        Args:
            page: Page to configure.
        """
        options = self.browsers[page.browser]['launch_options']

        # Default maximum navigation time.
        if 'defaultNavigationTimeout' in options:
            nav_timeout = options['defaultNavigationTimeout']
            page.setDefaultNavigationTimeout(nav_timeout)

        # Awaitables are collected here and awaited together at the end.
        pending = [self.set_stealth(page)]

        # URL blocking.
        if 'blockedURLs' in options:
            pending.append(self.set_blocked_urls(page, options['blockedURLs']))

        # Cache switch for every request.
        if 'setCacheEnabled' in options:
            pending.append(page.setCacheEnabled(options['setCacheEnabled']))

        # Extra scripts to run on every new document.
        pending.extend(
            page.evaluateOnNewDocument(source)
            for source in options.get('evaluateOnNewDocument', []))

        # Intercept requests and abort the unwanted ones.
        abort_types = options.get('requestAbortTypes')
        if abort_types:
            pending.append(page.setRequestInterception(True))

            async def abort_or_continue(request: Request):
                # Abort when the resource type is unwanted, or when redirects
                # are blocked and this is a navigation request that already
                # has a redirect chain. Evaluation is short-circuited left to
                # right.
                should_abort = request.resourceType in abort_types or (
                    options.get('blockRedirects', False)
                    and request.isNavigationRequest()
                    and len(request.redirectChain))
                if should_abort:
                    await request.abort()
                else:
                    await request.continue_()

            def schedule(request):
                # The event callback is synchronous, so the decision is
                # scheduled as a task.
                return asyncio.create_task(abort_or_continue(request))

            page.on('request', schedule)

        await asyncio.gather(*pending)
Exemplo n.º 11
0
 def _add_page_listeners(self, page: Page) -> None:
     """Enable request interception on ``page`` and attach the handlers.

     Interception is switched on synchronously via ``syncer.sync`` and then
     ``self._on_request`` / ``self._on_response`` are registered for the
     page's ``request`` and ``response`` events.

     Args:
         page: Page to intercept and listen on.
     """
     # Interception must be enabled on the same page the listeners are
     # attached to; using self._page here would leave a different `page`
     # argument without interception.
     syncer.sync(page.setRequestInterception(True))
     page.on('request', self._on_request)
     page.on('response', self._on_response)
Exemplo n.º 12
0
 async def close_page(self, page: Page) -> None:
     """Close ``page`` on a best-effort basis.

     The close is given two seconds. Any ``Exception`` raised while closing
     (including the timeout) is turned into a warning log entry instead of
     being propagated.

     Args:
         page: Page to close.
     """
     try:
         closing = page.close()
         await asyncio.wait_for(closing, timeout=2)
     except Exception:
         message = f"Page {page} could not be properly closed."
         self.logger.warning(message)
Exemplo n.º 13
0
def toCe(page: Page):
    """Run ``toIdc`` on ``page``, then open the cloud engine URL.

    Args:
        page: Page to drive.
    """
    cloud_engine_url = "http://cloudengine.yunzong.me:10470"
    toIdc(page)
    then(page.goto(cloud_engine_url))
Exemplo n.º 14
0
def toIdc(page: Page):
    """Open the ID center and sign in when the login form is shown.

    The login form is detected by the presence of the ``#in_user_Nm``
    element; when it is absent nothing is typed or clicked.

    Args:
        page: Page to drive.
    """
    then(page.goto("http://idcenter.box.zonghengke.com"))
    user_fields = then(page.JJ("#in_user_Nm"))
    if len(user_fields) == 0:
        # No login form on the page: nothing to fill in.
        return
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration or the environment.
    then(page.type("#in_user_Nm", "gaowenbo"))
    then(page.type("#in_password", "YKUacrVjlfoR"))
    then(page.click("#sign_in"))
Exemplo n.º 15
0
def toIdc(driver: page.Page):
    """Ask ``driver`` to navigate to the Baidu home page.

    Args:
        driver: Page object whose ``goto`` is called.
    """
    # NOTE(review): the value returned by goto() is discarded; if `driver` is
    # an asynchronous page this call only creates a coroutine that is never
    # awaited -- confirm.
    home_url = "https://www.baidu.com"
    driver.goto(home_url)