Exemplo n.º 1
0
    def browse_page(
            self, page_url, extra_headers=None,
            user_agent=None, behavior_parameters=None, behaviors_dir=None,
            on_request=None, on_response=None, on_screenshot=None,
            username=None, password=None, hashtags=None,
            skip_extract_outlinks=False, skip_visit_hashtags=False,
            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
        '''
        Loads a single page in the (already started) browser and processes it.

        The steps, in order: configure the browser, navigate to page_url,
        optionally attempt a login, optionally take a screenshot, run the
        behavior script, extract outlinks, visit hashtags, and finally read
        the url currently shown by the browser.

        Args:
            page_url: url of the page to browse
            extra_headers: passed to configure_browser(); dict of extra http
                headers for the browser to send with every request (default
                None)
            user_agent: passed to configure_browser(); user agent string that
                replaces the browser default if supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: directory containing behaviors.yaml and JS
                templates, forwarded to brozzler.behavior_script() (default
                None)
            on_request: if truthy, installed as websock_thread.on_request for
                the duration of this call (default None)
            on_response: if truthy, installed as websock_thread.on_response
                for the duration of this call (default None)
            on_screenshot: if truthy, handed to _try_screenshot() right after
                navigation/login and before behaviors run; presumably called
                with the raw jpeg bytes -- confirm against _try_screenshot()
                (default None)
            username: forwarded to try_login() together with password
                (default None)
            password: a login attempt is made only when this is truthy
                (default None)
            hashtags: forwarded to visit_hashtags() (default None)
            skip_extract_outlinks: if true, extract_outlinks() is not called
                and the outlinks list starts out empty (default False)
            skip_visit_hashtags: if true, visit_hashtags() is not called
                (default False)
            skip_youtube_dl: accepted but not referenced in this method
            page_timeout: timeout handed to navigate_to_page() and
                try_login() (default 300)
            behavior_timeout: timeout handed to run_behavior() (default 900)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: value of self.url() read at the very end of the
                cycle, i.e. after behaviors and hashtag visits
            outlinks: result of extract_outlinks(), or an empty list when
                skip_extract_outlinks is set

        Raises:
            brozzler.ReachedLimit: re-raised as the instance stashed on
                websock_thread.reached_limit
            BrowsingException: if the browser is not running, is already
                browsing, or the websocket connection is closed mid-browse
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')

        self.is_browsing = True
        # only hooks that were actually supplied get installed; whatever is
        # currently on the websocket thread stays in place otherwise
        for hook_name, hook in (
                ('on_request', on_request), ('on_response', on_response)):
            if hook:
                setattr(self.websock_thread, hook_name, hook)

        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(
                        user_agent=user_agent, extra_headers=extra_headers)
                self.navigate_to_page(page_url, timeout=page_timeout)

                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # compare without the fragment; if the login flow left us
                    # on a different page, go back to the one we were asked
                    # to browse
                    url_sans_fragment = self.url().partition('#')[0]
                    if page_url != url_sans_fragment:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)

                if on_screenshot:
                    self._try_screenshot(on_screenshot)

                script = brozzler.behavior_script(
                        page_url, behavior_parameters,
                        behaviors_dir=behaviors_dir)
                self.run_behavior(script, timeout=behavior_timeout)

                outlinks = (
                        [] if skip_extract_outlinks
                        else self.extract_outlinks())
                if not skip_visit_hashtags:
                    self.visit_hashtags(self.url(), hashtags, outlinks)

                return self.url(), outlinks
        except brozzler.ReachedLimit:
            # the websocket thread keeps a ReachedLimit carrying more
            # detail than the one caught here, so that one is raised
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as exc:
            self.logger.error('websocket closed, did chrome die?')
            raise BrowsingException(exc)
        finally:
            self.is_browsing = False
            # hooks are cleared unconditionally, whether or not this call
            # installed them
            for hook_name in ('on_request', 'on_response'):
                setattr(self.websock_thread, hook_name, None)
Exemplo n.º 2
0
    def browse_page(self,
                    page_url,
                    extra_headers=None,
                    user_agent=None,
                    behavior_parameters=None,
                    behaviors_dir=None,
                    on_request=None,
                    on_response=None,
                    on_service_worker_version_updated=None,
                    on_screenshot=None,
                    username=None,
                    password=None,
                    hashtags=None,
                    screenshot_full_page=False,
                    skip_extract_outlinks=False,
                    skip_visit_hashtags=False,
                    skip_youtube_dl=False,
                    simpler404=False,
                    page_timeout=300,
                    behavior_timeout=900,
                    extract_outlinks_timeout=60,
                    download_throughput=-1,
                    stealth=False):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, optionally tries to log
        in, runs behaviors, takes a screenshot, extracts outlinks and visits
        hashtags.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the browser
                to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: Directory containing behaviors.yaml and JS templates
                (default None loads Brozzler default JS behaviors)
            on_request: callback to invoke on every Network.requestWillBeSent
                event, takes one argument, the json-decoded message (default
                None)
            on_response: callback to invoke on every Network.responseReceived
                event, takes one argument, the json-decoded message (default
                None)
            on_service_worker_version_updated: callback to invoke on every
                ServiceWorker.workerVersionUpdated event, takes one argument,
                the json-decoded message (default None)
            on_screenshot: callback handed to _try_screenshot() once behaviors
                have finished; documented as taking one argument, the raw
                jpeg bytes (default None)
                # XXX an earlier note says it may take two arguments, the url
                # of the page at the time of the screenshot and the raw jpeg
                # bytes -- confirm against _try_screenshot()
            username: username string to use to try logging in if a login form
                is found in the page (default None)
            password: password string to use to try logging in if a login form
                is found in the page; no login is attempted unless this is
                truthy (default None)
            hashtags: forwarded to visit_hashtags() (default None)
            screenshot_full_page: forwarded to _try_screenshot() (default
                False)
            skip_extract_outlinks: if true, extract_outlinks() is not called
                and the outlinks list starts out empty (default False)
            skip_visit_hashtags: if true, visit_hashtags() is not called
                (default False)
            skip_youtube_dl: accepted but not referenced in this method
            simpler404: if true and websock_thread.page_status is None or
                >= 400, behaviors, outlink extraction and hashtag visits are
                all skipped; the screenshot is still taken (default False)
            page_timeout: timeout passed to navigate_to_page() and
                try_login() (default 300)
            behavior_timeout: timeout passed to run_behavior(); behaviors are
                skipped entirely when this is not greater than 0 (default 900)
            extract_outlinks_timeout: timeout passed to extract_outlinks()
                (default 60)
            download_throughput: forwarded to configure_browser() (default -1)
            stealth: forwarded to configure_browser() (default False)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar after behaviors have
                run (read before the screenshot, outlink extraction and
                hashtag visits), which could be different from the original
                page url if the page redirects, javascript has changed the url
                in the location bar, etc
            outlinks: a list of navigational links extracted from the page

        Raises:
            brozzler.ProxyError: in case of proxy connection error (not raised
                directly in this method)
            brozzler.ReachedLimit: re-raised as the instance stashed on
                websock_thread.reached_limit
            BrowsingException: if browsing the page fails in some other way
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        self.is_browsing = True
        if on_request:
            self.websock_thread.on_request = on_request
        if on_response:
            self.websock_thread.on_response = on_response
        if on_service_worker_version_updated:
            self.websock_thread.on_service_worker_version_updated = \
                    on_service_worker_version_updated
        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(extra_headers=extra_headers,
                                       user_agent=user_agent,
                                       download_throughput=download_throughput,
                                       stealth=stealth)
                self.navigate_to_page(page_url, timeout=page_timeout)
                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # if login redirected us, return to page_url
                    if page_url != self.url().split('#')[0]:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)
                # If the target page HTTP status is 4xx/5xx, there is no point
                # in running behaviors, outlink and hashtag extraction as we
                # didn't get a valid page. Screenshot should run because it
                # may be useful to have a picture of the error page.
                # This is only enabled with option `simpler404`.
                run_behaviors = True
                if simpler404 and (self.websock_thread.page_status is None
                                   or self.websock_thread.page_status >= 400):
                    run_behaviors = False

                if run_behaviors and behavior_timeout > 0:
                    behavior_script = brozzler.behavior_script(
                        page_url,
                        behavior_parameters,
                        behaviors_dir=behaviors_dir)
                    self.run_behavior(behavior_script,
                                      timeout=behavior_timeout)
                # read the url before the screenshot, outlink extraction and
                # hashtag visits so it reflects the page behaviors ended on
                final_page_url = self.url()
                if on_screenshot:
                    self._try_screenshot(on_screenshot, screenshot_full_page)
                if not run_behaviors or skip_extract_outlinks:
                    outlinks = []
                else:
                    outlinks = self.extract_outlinks(
                        timeout=extract_outlinks_timeout)
                if run_behaviors and not skip_visit_hashtags:
                    self.visit_hashtags(final_page_url, hashtags, outlinks)
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
            # websock_thread has stashed the ReachedLimit exception with
            # more information, raise that one
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as e:
            self.logger.error('websocket closed, did chrome die?')
            # chain explicitly so the websocket error is recorded as the cause
            raise BrowsingException(e) from e
        finally:
            self.is_browsing = False
            # clear every per-call callback, including the service worker
            # one, so none of them leaks into the next browse_page() call
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None
            self.websock_thread.on_service_worker_version_updated = None
Exemplo n.º 3
0
    def browse_page(
            self, page_url, extra_headers=None,
            user_agent=None, behavior_parameters=None, behaviors_dir=None,
            on_request=None, on_response=None,
            on_service_worker_version_updated=None, on_screenshot=None,
            username=None, password=None, hashtags=None,
            skip_extract_outlinks=False, skip_visit_hashtags=False,
            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
        '''
        Browses page in browser.

        Browser should already be running, i.e. start() should have been
        called. Opens the page_url in the browser, optionally tries to log
        in, takes a screenshot, runs behaviors, extracts outlinks and visits
        hashtags.

        Args:
            page_url: url of the page to browse
            extra_headers: dict of extra http headers to configure the browser
                to send with every request (default None)
            user_agent: user agent string, replaces browser default if
                supplied (default None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            behaviors_dir: Directory containing behaviors.yaml and JS templates
                (default None loads Brozzler default JS behaviors)
            on_request: callback to invoke on every Network.requestWillBeSent
                event, takes one argument, the json-decoded message (default
                None)
            on_response: callback to invoke on every Network.responseReceived
                event, takes one argument, the json-decoded message (default
                None)
            on_service_worker_version_updated: callback to invoke on every
                ServiceWorker.workerVersionUpdated event, takes one argument,
                the json-decoded message (default None)
            on_screenshot: callback handed to _try_screenshot() before
                behaviors run; documented as taking one argument, the raw
                jpeg bytes (default None)
                # XXX an earlier note says it may take two arguments, the url
                # of the page at the time of the screenshot and the raw jpeg
                # bytes -- confirm against _try_screenshot()
            username: username string to use to try logging in if a login form
                is found in the page (default None)
            password: password string to use to try logging in if a login form
                is found in the page; no login is attempted unless this is
                truthy (default None)
            hashtags: forwarded to visit_hashtags() (default None)
            skip_extract_outlinks: if true, extract_outlinks() is not called
                and the outlinks list starts out empty (default False)
            skip_visit_hashtags: if true, visit_hashtags() is not called
                (default False)
            skip_youtube_dl: accepted but not referenced in this method
            page_timeout: timeout passed to navigate_to_page() and
                try_login() (default 300)
            behavior_timeout: timeout passed to run_behavior() (default 900)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url in the location bar at the end of the
                browse_page cycle, which could be different from the original
                page url if the page redirects, javascript has changed the url
                in the location bar, etc
            outlinks: a list of navigational links extracted from the page

        Raises:
            brozzler.ProxyError: in case of proxy connection error (not raised
                directly in this method)
            brozzler.ReachedLimit: re-raised as the instance stashed on
                websock_thread.reached_limit
            BrowsingException: if browsing the page fails in some other way
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')
        self.is_browsing = True
        if on_request:
            self.websock_thread.on_request = on_request
        if on_response:
            self.websock_thread.on_response = on_response
        if on_service_worker_version_updated:
            self.websock_thread.on_service_worker_version_updated = \
                    on_service_worker_version_updated
        try:
            with brozzler.thread_accept_exceptions():
                self.configure_browser(
                        extra_headers=extra_headers,
                        user_agent=user_agent)
                self.navigate_to_page(page_url, timeout=page_timeout)
                if password:
                    self.try_login(username, password, timeout=page_timeout)
                    # if login redirected us, return to page_url
                    if page_url != self.url().split('#')[0]:
                        self.logger.debug(
                            'login navigated away from %s; returning!',
                            page_url)
                        self.navigate_to_page(page_url, timeout=page_timeout)
                if on_screenshot:
                    self._try_screenshot(on_screenshot)
                behavior_script = brozzler.behavior_script(
                        page_url, behavior_parameters,
                        behaviors_dir=behaviors_dir)
                self.run_behavior(behavior_script, timeout=behavior_timeout)
                if skip_extract_outlinks:
                    outlinks = []
                else:
                    outlinks = self.extract_outlinks()
                if not skip_visit_hashtags:
                    self.visit_hashtags(self.url(), hashtags, outlinks)
                final_page_url = self.url()
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
            # websock_thread has stashed the ReachedLimit exception with
            # more information, raise that one
            raise self.websock_thread.reached_limit
        except websocket.WebSocketConnectionClosedException as e:
            self.logger.error('websocket closed, did chrome die?')
            # chain explicitly so the websocket error is recorded as the cause
            raise BrowsingException(e) from e
        finally:
            self.is_browsing = False
            # clear every per-call callback, including the service worker
            # one, so none of them leaks into the next browse_page() call
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None
            self.websock_thread.on_service_worker_version_updated = None
Exemplo n.º 4
0
    def browse_page(
            self, page_url, ignore_cert_errors=False, extra_headers=None,
            user_agent=None, behavior_parameters=None,
            on_request=None, on_response=None, on_screenshot=None):
        '''
        Loads a single page in the (already started) browser and processes it.

        All browser interaction goes through self._browser_controller. The
        steps, in order: navigate to page_url, run the behavior script,
        optionally take a screenshot, extract outlinks, and read the url
        currently shown by the browser.

        Args:
            page_url: url of the page to browse
            ignore_cert_errors: accepted but not referenced in this method
            extra_headers: accepted but not referenced in this method;
                described as a dict of extra http headers for the browser to
                send with every request (default None)
            user_agent: accepted but not referenced in this method; described
                as a user agent string replacing the browser default (default
                None)
            behavior_parameters: dict of parameters for populating the
                javascript behavior template (default None)
            on_request: accepted but not referenced in this method; described
                as a callback for every Network.requestWillBeSent event
                (default None)
            on_response: accepted but not referenced in this method; described
                as a callback for every Network.responseReceived event
                (default None)
            on_screenshot: if truthy, the page is scrolled to the top after
                behaviors finish, a screenshot is taken, and this callback is
                called with one argument, the screenshot bytes (default None)

        Returns:
            A tuple (final_page_url, outlinks).
            final_page_url: the url reported by the browser controller at the
                end of the cycle, which could differ from page_url if the page
                redirects, javascript has changed the location, etc
            outlinks: the result of the controller's extract_outlinks()

        Raises:
            BrowsingException: if the browser is not running, is already
                browsing, or the websocket connection is closed mid-browse
        '''
        if not self.is_running():
            raise BrowsingException('browser has not been started')
        if self.is_browsing:
            raise BrowsingException('browser is already busy browsing a page')

        self.is_browsing = True
        try:
            # navigation and behavior timeouts are fixed here (300 and 900)
            self._browser_controller.navigate_to_page(page_url, timeout=300)
            # TODO: when login credentials are available, attempt
            # self._browser_controller.try_login(login_credentials) here
            # (5 min timeout?)
            script = brozzler.behavior_script(page_url, behavior_parameters)
            self._browser_controller.run_behavior(script, timeout=900)

            if on_screenshot:
                self._browser_controller.scroll_to_top()
                on_screenshot(self._browser_controller.screenshot())

            outlinks = self._browser_controller.extract_outlinks()
            # TODO: for each hashtag not already visited:
            #     navigate to the hashtag (nothing to wait for, so no
            #         timeout?)
            #     if on_screenshot: take screenshot (30 sec)
            #     run behavior (3 min)
            #     outlinks += retrieve outlinks (60 sec)
            return self._browser_controller.url(), outlinks
        except websocket.WebSocketConnectionClosedException as exc:
            self.logger.error('websocket closed, did chrome die?')
            raise BrowsingException(exc)
        finally:
            self.is_browsing = False